Land b1_8_gate onto b1_8 (20081218_1708)

author johann <johann>

Thu, 18 Dec 2008 18:02:32 +0000 (18:02 +0000)

committer johann <johann>

Thu, 18 Dec 2008 18:02:32 +0000 (18:02 +0000)
author johann <johann>
Thu, 18 Dec 2008 18:02:32 +0000 (18:02 +0000)
committer johann <johann>
Thu, 18 Dec 2008 18:02:32 +0000 (18:02 +0000)
diff --git a/lustre/BUGS b/lustre/BUGS

index ba84777..6679d5d 100644 (file)
--- a/lustre/BUGS
+++ b/lustre/BUGS
@@ -1 +1 @@
-To report bugs, please visit http://bugzilla.clusterfs.com/
+To report bugs, please visit https://bugzilla.lustre.org/
diff --git a/lustre/BUILDING b/lustre/BUILDING

index 1c69d3c..f54cb07 100644 (file)
--- a/lustre/BUILDING
+++ b/lustre/BUILDING
@@ -4,7 +4,7 @@ BUILDING LUSTRE
  You must already have a Lustre-patched kernel, which is outside of the
  scope of this document.  For more information on this process, see the
  web sites below.  Also consider downloading a pre-packaged Lustre
-kernel and utilities from http://www.lustre.org/downloads.html
+kernel and utilities from http://downloads.lustre.org/
  
  To build:
    sh autogen.sh
@@ -21,10 +21,7 @@ To clean up:
  More information about Lustre:
    http://www.lustre.org/
  
-More information about Cluster File Systems:
-  http://www.clusterfs.com/
-
  Feedback: 
-  lustre-discuss@lists.clusterfs.com
+  lustre-discuss@lists.lustre.org
  
  - The Lustre Team -
diff --git a/lustre/ChangeLog b/lustre/ChangeLog

index 396b9fb..15bcca8 100644 (file)
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -1,10 +1,52 @@
  tbd Sun Microsystems, Inc.
+       * version 1.8.1
+       * Support for kernels:
+        2.6.5-7.314 (SLES 9),
+        2.6.9-67.0.22.EL (RHEL 4),
+        2.6.16.60-0.31 (SLES 10),
+        2.6.18-92.1.17.el5 (RHEL 5),
+        2.6.22.14 vanilla (kernel.org)
+       * Client support for unpatched kernels:
+         (see http://wiki.lustre.org/index.php?title=Patchless_Client)
+         2.6.16 - 2.6.22 vanilla (kernel.org)
+       * Client support for unpatched kernels:
+         we do not recommend using patchless RHEL4 clients with kernels
+         prior to 2.6.9-55EL (RHEL4U5).
+       * Recommended e2fsprogs version: 1.40.11-sun1
+       * Note that reiserfs quotas are disabled on SLES 10 in this kernel.
+       * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a
+         removed cwd "./" (refer to Bugzilla 14399).
+       * A new quota file format has been introduced in 1.6.5.
+         The format conversion from prior releases is handled transparently,
+         but releases older than 1.4.12/1.6.5 don't understand this new
+         format. The automatic format conversion can be avoided by running
+         the following command on the MDS:
+               'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'.
+         For more information, please refer to bugzilla 13904.
+       * Output of lfs quota has been made less detailed by default,
+         old (verbose) output can be obtained by using -v option.
+       * File join has been disabled in this release, refer to Bugzilla 16929.
+       * A new Lustre ADIO driver is available for MPICH2-1.0.7.
+       * NFS export disabled when stack size < 8192. Since the NFSv4 export
+         of Lustre filesystem with 4K stack may cause a stack overflow. For more
+         information, please refer to bugzilla 17630.
+
+Severity   : normal
+Frequency  : race on file read and write
+Bugzilla   : 16417
+Description: Lustre doesn't delete files
+Details    : Clients drop lock reference and release openhandle when they find
+             stale inode.
+
+
+-------------------------------------------------------------------------------
+12-31-2008 Sun Microsystems, Inc.
         * version 1.8.0
         * Support for kernels:
-        2.6.5-7.311 (SLES 9),
-        2.6.9-67.0.20.EL (RHEL 4),
-        2.6.16.54-0.2.5 (SLES 10),
-         2.6.18-53.1.21.el5 (RHEL 5),
+        2.6.5-7.314 (SLES 9),
+        2.6.9-67.0.22.EL (RHEL 4),
+        2.6.16.60-0.31 (SLES 10),
+        2.6.18-92.1.17.el5 (RHEL 5),
          2.6.22.14 vanilla (kernel.org)
         * Client support for unpatched kernels:
           (see http://wiki.lustre.org/index.php?title=Patchless_Client)
@@ -23,7 +65,7 @@ tbd Sun Microsystems, Inc.
           the following command on the MDS:
                 'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'.
           For more information, please refer to bugzilla 13904.
-       * A new quota file format was introduced in 1.8.0.
+       * A new quota file format was introduced in 1.6.6/1.8.0.
           The format conversion from prior releases is handled transparently,
           but releases older than 1.6.6/1.8.0 don't understand this new
           format. The automatic format conversion can be avoided by running
@@ -33,78 +75,384 @@ tbd Sun Microsystems, Inc.
                 'tunefs.lustre --param="ost.quota_type=ug1" $MDTDEV'
                 or (for 1.4.12/1.6.5 quota files)
                 'tunefs.lustre --param="mdt.quota_type=ug2" $MDTDEV',
-               'tunefs.lustre --param="ost.quota_type=ug2" $MDTDEV'            
+               'tunefs.lustre --param="ost.quota_type=ug2" $MDTDEV'
           For more information, please refer to bugzilla 13904.
+       * Output of lfs quota has been made less detailed by default,
+         old (verbose) output can be obtained by using -v option.
+       * File join has been disabled in this release, refer to Bugzilla 16929.
+       * A new Lustre ADIO driver is available for MPICH2-1.0.7.
+       * NFS export disabled when stack size < 8192. Since the NFSv4 export of
+         Lustre filesystem with 4K stack may cause a stack overflow. For more
+         information, please refer to bugzilla 17630.
+
+Severity   : major
+Frequency  : on remount
+Bugzilla   : 18018
+Description: external journal device not working after the remount
+Details    : clear dev_rdonly flag for external journal devices in
+             blkdev_put()
+
+Severity   : minor
+Frequency  : rare
+Bugzilla   : 17802
+Description: shutdown vs evict race
+Details    : client_disconnect_export vs connect request race.
+            if client will evicted at this time - we start invalidate
+             thread without referece to import and import can be freed
+             at same time.
+
+Severity   : minor
+Frequency  : always
+Bugzilla   : 16693
+Description: shrink LOV EAs before replying
+Details    : correctly adjust LOV EA buffer for reply.
+
+Severity   : normal
+Frequency  : rare
+Bugzilla   : 16081
+Description: don't skip ost target if they assigned to file
+Details    : Drop slow OSCs if we can, but not for requested start idx.
+             This means "if OSC is slow and it is not the requested
+             start OST, then it can be skipped, otherwise skip it only
+             if it is inactive/recovering/out-of-space.
+
+Severity   : enhancement
+Bugzilla   : 17201
+Description: Update to RHEL5 kernel-2.6.18-92.1.17.el5.
+
+Severity   : normal
+Frequency  : rare, need acl's on inode.
+Bugzilla   : 16492
+Description: client can't handle ost additional correctly
+Details    : if ost was added after client connected to mds client can have
+             hit lnet_try_match_md ... to big messages to wide striped files.
+             in this case need teach client to handle config events about add
+             lov target and update client max ea size at that event.
+
+Severity   : enhancement
+Bugzilla   : 17374
+Description: Update to sles9 kernel-2.6.5-7.314.
+
+Severity   : enhancement
+Bugzilla   : 17458
+Description: Update to SLES10 SP2 kernel-2.6.16.60-0.31.
+
+Severity   : normal
+Frequency  : Create a symlink file with a very long name
+Bugzilla   : 16578
+Description: ldlm_cancel_pack()) ASSERTION(max >= dlm->lock_count + count)
+Details    : If there is no extra space in the request for early cancels,
+            ldlm_req_handles_avail() returns 0 instead of a negative value.
+
+Severity   : major
+Frequency  : rare
+Bugzilla   : 16492
+Description: mds is deadlocked
+Details    : in rare cases, inode in catalog can have i_no less than have parent
+            i_no, this produce wrong order for locking during open, and parallel
+            unlink can be lock open. this need teach mds_open to grab locks in
+            resource id order, not at parent -> child order.
+
+Severity   : enhancement
+Bugzilla   : 1819
+Description: Add /proc entry for import status
+Details    : The mdc, osc, and mgc import directories now have
+            an import directory that contains useful import data for debugging
+            connection problems.
+
+Severity   : enhancement
+Bugzilla   : 15966
+Description: Re-disable certain /proc logging
+Details    : Enable and disable client's offset_stats, extents_stats and
+            extents_stats_per_process stats logging on the fly.
+
+Severity   : major
+Frequency  : Only on FC kernels 2.6.22+
+Bugzilla   : 16303
+Description: oops in statahead
+Details    : Do not drop reference count for the dentry from VFS when lookup,
+            VFS will do that by itself.
+
+Severity   : enhancement
+Bugzilla   : 16643
+Description: Generic /proc file permissions
+Details    : Set /Proc file permissions in a more generic way to enable non-
+            root users operate on some /proc files.
+
+Severity   : major
+Bugzilla   : 16561
+Description: Hitting mdc_commit_close() ASSERTION
+Details    : Properly handle request reference release in
+            ll_release_openhandle().
+
+Severity   : normal
+Bugzilla   : 15975
+Frequency  : only patchless client
+Description: add workaround for race between add/remove dentry from hash
+
+Severity   : enhancement
+Bugzilla   : 16845
+Description: Allow OST glimpses to return PW locks
+
+Severity   : minor
+Bugzilla   : 16717
+Description: LBUG when llog conf file is full
+Details    : When llog bitmap is full, ENOSPC should be returned for plain
+            log.
+
+Severity   : normal
+Bugzilla   : 16907
+Description: Prevent import from entering FULL state when server in recovery
+
+Severity   : major
+Bugzilla   : 16750
+Description: service mount cannot take device name with ":"
+Details    : Only when device name contains ":/" will mount treat it as
+            client mount.
+
+Severity   : normal
+Bugzilla   : 15927
+Frequency  : rare
+Description: replace ptlrpcd with the statahead thread to interpret the async
+            statahead RPC callback
+
+Severity   : normal
+Bugzilla   : 16611
+Frequency  : on recovery
+Description: I/O failures after umount during fail back
+Details    : if client reconnected to restarted server we need join to recovery
+            instead of find server handler is changed and process self eviction
+            with cancel all locks.
+
+Severity   : enhancement
+Bugzilla   : 16633
+Description: Update to RHEL5 kernel-2.6.18-92.1.10.el5.
+
+Severity   : normal
+Bugzilla   : 15825
+Description: Kernel BUG tries to release flock
+Details    : Lustre does not destroy flock lock before last reference goes
+            away. So always drop flock locks when client is evicted and
+            perform unlock regardless of successfulness of speaking to MDS.
+
+Severity   : enhancement
+Bugzilla   : 16547
+Description: Update to SLES10 SP2 kernel-2.6.16.60-0.27.
+
+Severity   : enhancement
+Bugzilla   : 16566
+Description: Upcall on Lustre log has been dumped
+Details    : Allow for a user mode script to be called once a Lustre log has
+            been dumped. It passes the filename of the dumped log to the
+            script, the location of the script can be specified via
+            /proc/sys/lnet/debug_log_upcall.
+
+Severity   : minor
+Bugzilla   : 16583
+Frequency  : rare
+Description: avoid messages about idr_remove called for id that is not allocated
+Details    : Move assigment s_dev for clustered nfs to end of initialization,
+            for avoid problem with error handling.
+
+Severity   : minor
+Bugzilla   : 16109
+Frequency  : rare
+Description: avoid Already found the key in hash [CONN_UNUSED_HASH] messages
+Details    : When connection is reused this not moved from CONN_UNUSED_HASH into
+            CONN_USED_HASH and this prodice warning when put connection again
+            in unused hash.
+
+Severity   : normal
+Bugzilla   : 15139
+Frequency  : rare
+Description: avoid ASSERTION(client_stat->nid_exp_ref_count == 0) failed
+Details    : release reference to stats when client disconnected, not
+            when export destroyed for avoid races when client destroyed
+            after main ost export.
+
+Severity   : normal
+Bugzilla   : 16679
+Description: more cleanup in mds_lov
+Details    : add workaround for get valid ost count for avoid warnings about
+            drop too big messages, not init llog cat under semphore which
+            can be blocked on reconnect and break normal replay, fix access
+            to wrong pointer.
+
+Severity   : enhancement
+Bugzilla   : 15899
+Description: File striping can now be set to use an arbitrary pool of OSTs.
+
+Severity   : enhancement
+Bugzilla   : 16573
+Description: Export bytes_read/bytes_write count on OSC/OST.
+
+Severity   : normal
+Bugzilla   : 16237
+Description: Early reply size mismatch, MGC loses connection
+Details    : Apply the MGS_CONNECT_SUPPORTED mask at reconnect time so
+            the connect flags are properly negotiated.
+
+Severity   : normal
+Bugzilla   : 16006
+Description: Properly propagate oinfo flags from lov to osc for statfs
+Details    : restore missing copy oi_flags to lov requests.
+
+Severity   : normal
+Bugzilla   : 16317
+Description: exports in /proc are broken
+Details    : recreate /proc entries for clients when they reconnect.
+
+Severity   : enhancement
+Bugzilla   : 16581
+Description: Add man pages for llobdstat(8), llstat(8), plot-llstat(8),
+          : l_getgroups(8), lst(8), routerstat(8)
+Details    : included man pages for llobdstat(8), llstat(8),
+          : plot-llstat(8), l_getgroups(8), lst(8), routerstat(8)
+
+Severity   : enhancement
+Bugzilla   : 16208
+Description: Implement lustre ll_show_options method.
+
+Severity   : enhancement
+Bugzilla   : 16188
+Description: Update to SLES9 kernel-2.6.5-7.312.
+
+Severity   : enhancement
+Bugzilla   : 16503
+Description: Update to RHEL4 kernel-2.6.9-67.0.22.EL.
+
+Severity   : normal
+Bugzilla   : 16317
+Description: exports in /proc are broken
+Details    : recreate /proc entries for clients when they reconnect.
+
+Severity   : normal
+Bugzilla   : 16080
+Description: don't fail open with -ERANGE
+Details    : if client connected until mds will be know about real ost count
+            get LOV EA can be fail because mds not allocate enougth buffer
+            for LOV EA.
+
+Severity   : normal
+Bugzilla   : 15576
+Description: Resolve device initialization race
+Details    : Prevent proc handler from accessing devices added to the
+            obd_devs array but yet be intialized.
+
+Severity   : enhancement
+Bugzilla   : 16091
+Description: configure's --enable-quota should check the
+          : kernel .config for CONFIG_QUOTA
+Details    : configure is terminated if --enable-quota is passed but
+          : no quota support is in kernel
+
+Severity   : enhancement
+Bugzilla   : 15308
+Description: Update to SLES10 SP2 kernel-2.6.16.60-0.23.
+
+Severity   : enhancement
+Bugzilla   : 16190
+Description: Update to RHEL5 kernel-2.6.18-92.1.6.el5.
+
+Severity   : normal
+Bugzilla   : 16318
+Frequency  : rare, on PPC clients
+Description: don't swab ost objects in response about directory, because
+            this not exist.
+Details    : bug similar bug 14856, but in different function.
+
+Severity   : enhancement
+Bugzilla   : 15754
+Description: lfs quota tool enhancement
+Details    : added units specifiers support for setquota, default to
+            current uid/gid for quota report, short quota stats by
+            default, nonpositional parameters for setquota, added
+            llapi_quotactl manual page.
+
+Severity   : enhancement
+Bugzilla   : 15625
+Description: *optional* service tags registration
+Details    : if the "service tags" package is installed on a Lustre node
+            When the filesystem is mounted, a local-node service tag will
+            be created.  See http://inventory.sun.com/ for more information
+            about the Service Tags asset management system.
  
  Severity   : normal
  Bugzilla   : 16037
  Description: Client runs out of low memory
  Details    : Consider only lowmem when counting initial number of llap pages
  
-Severity   : normal
  Bugzilla   : 15825
  Description: Kernel BUG tries to release flock
  Details    : Lustre does not destroy flock lock before last reference goes
-             away. So always drop flock locks when client is evicted and 
-             perform unlock regardless of successfulness of speaking to MDS.
+            away. So always drop flock locks when client is evicted and
+            perform unlock regardless of successfulness of speaking to MDS.
  
  Severity   : normal
+Frequency  : occasional
  Bugzilla   : 15210
-Description: add recount protection for osc callbacks, so avoid panic on shutdown
+Description: add refcount for osc callbacks, so avoid panic on shutdown
  
  Severity   : enhancement
  Bugzilla   : 16189
  Description: Update to RHEL4 kernel-2.6.9-67.0.20.
  
  Severity   : normal
+Frequency  : testing only
  Bugzilla   : 12653
  Description: sanity test 65a fails if stripecount of -1 is set
  Details    : handle -1 striping on filesystem in ll_dirstripe_verify
  
  Severity   : normal
+Frequency  : only in unusual configurations
  Bugzilla   : 16014
  Description: Kernel panic with find ost index.
  Details    : lov_obd have panic if some OST's have sparse indexes.
  
-Severity   : normal
-Bugzilla   : 14742
-Frequency  : rare
-Description: ASSERTION(CheckWriteback(page,cmd)) failed
-Details    : badly clear PG_Writeback bit in ll_ap_completion can produce false
-            positive assertion.
-
  Severity   : enhancement
  Bugzilla   : 15865
  Description: Update to RHEL5 kernel-2.6.18-53.1.21.el5.
  
  Severity   : major
+Frequency  : rarely, if filesystem is mounted with -o flock
  Bugzilla   : 15924
  Description: do not process already freed flock
  Details    : flock can possibly be freed by another thread before it reaches
              to ldlm_flock_completion_ast.
  
  Severity   : normal
+Frequency  : rarely, if filesystem is mounted with -o flock
  Bugzilla   : 14480
  Description: LBUG during stress test
  Details    : Need properly lock accesses the flock deadlock detection list.
  
  Severity   : minor
+Frequency  : rarely, if binaries are being run from Lustre
  Bugzilla   : 15837
  Description: oops in page fault handler
-Details    : kernel page fault handler can return two special 'pages' in 
+Details    : kernel page fault handler can return two special 'pages' in
              error case, don't try dereference NOPAGE_SIGBUS and NOPAGE_OMM.
  
  Severity   : minor
+Frequency  : rarely, during shutdown
  Bugzilla   : 15716
  Description: timeout with invalidate import.
-Details    : ptlrpcd_check call obd_zombie_impexp_cull and wait request which should be
-            handled by ptlrpcd. This produce long age waiting and -ETIMEOUT
-            ptlrpc_invalidate_import and as result LASSERT.
+Details    : ptlrpcd_check call obd_zombie_impexp_cull and wait request which
+            should be handled by ptlrpcd. This produce long age waiting and
+            -ETIMEOUT ptlrpc_invalidate_import and as result LASSERT.
+
+Severity   : normal
+Frequency  : rarely
+Bugzilla   : 14742
+Frequency  : rare
+Description: ASSERTION(CheckWriteback(page,cmd)) failed
+Details    : badly clear PG_Writeback bit in ll_ap_completion can produce false
+            positive assertion.
  
  Severity   : normal
  Frequency  : only with broken builds/installations
  Bugzilla   : 15779
-Description: do not LBUG if lquota.ko and fsfilt_ldiskfs.ko are of different versions
+Description: no LBUG if lquota.ko and fsfilt_ldiskfs.ko are different versions
  Details    : just return an error to a user, put a console error message
  
  Severity   : enhancement
@@ -115,7 +463,7 @@ Severity   : enhancement
  Bugzilla   : 15742
  Description: Update to RHEL4 kernel-2.6.9-67.0.15.
  
-Severity   : major
+Severity   : enhancement
  Bugzilla   : 14134
  Description: enable MGS and MDT services start separately
  Details    : add a 'nomgs' option in mount.lustre to enable start a MDT with
@@ -123,36 +471,37 @@ Details    : add a 'nomgs' option in mount.lustre to enable start a MDT with
              to 'nosvc' mount option.
  
  Severity   : normal
-Frequency  : always, on ppc.
+Frequency  : always, on big-endian systems
  Bugzilla   : 14856
-Description: cleanup in ptlrpc code, related to ppc platform
-Details    : store magic in native order avoid panic's in recovery on ppc node
-            and forbid from this error in future. Also fix posibily of twice swab
-            data. Fix get lov striping to userpace.
+Description: cleanup in ptlrpc code, related to PPC platform
+Details    : store magic in native order avoid panic's in recovery on PPC
+            node and forbid from this error in future. Also fix posibily
+            of twice swab data. Fix get lov striping to userpace.
  
  Severity   : normal
+Frequency  : rarely, if replay get lost on server
  Bugzilla   : 15756
-Frequency  : rare, replay get lost on server
  Description: server incorrectly drop resent replays lead to recovery failure.
  Details    : do not drop replay according to msg flags, instead we check the
              per-export recovery request queue for duplication of transno.
  
  Severity   : normal
-Bugzilla   : 14835
  Frequency  : after recovery
+Bugzilla   : 14835
  Description: precreate to many object's after del orphan.
  Details    : del orphan st in oscc last_id == next_id and this triger growing
              count of precreated objects. Set flag LOW to skip increase count
              of precreated objects.
  
  Severity   : normal
-Bugzilla   : 15139
  Frequency  : rare, on clear nid stats
+Bugzilla   : 15139
  Description: ASSERTION(client_stat->nid_exp_ref_count == 0)
  Details    : when clean nid stats sometimes try destroy live entry,
              and this produce panic in free.
  
  Severity   : major
+Frequency  : occasionally since 1.6.4
  Bugzilla   : 15575
  Description: Stack overflow during MDS log replay
  Details    : ease stack pressure by using a thread dealing llog_process.
@@ -166,59 +515,72 @@ Details      : Mountpoint references were being leaked during open reply
              in reconstruct_open() and free dentry reference also.
  
  Severity   : normal
+Frequency  : rare
  Bugzilla   : 15443
  Description: wait until IO finished before start new when do lock cancel.
  Details    : VM protocol want old IO finished before start new, in this case
-            need wait until PG_writeback is cleared until check dirty flag and
-            call writepages in lock cancel callback.
+            need wait until PG_writeback is cleared until check dirty flag
+            and call writepages in lock cancel callback.
  
  Severity   : normal
+Frequency  : rare
  Bugzilla   : 12888
  Description: mds_mfd_close() ASSERTION(rc == 0)
  Details    : In mds_mfd_close(), we need protect inode's writecount change
              within its orphan write semaphore to prevent possible races.
  
  Severity   : minor
-Bugzilla   : 14645
  Frequency  : rare, on shutdown ost
+Bugzilla   : 14645
  Description: don't hit live lock with umount ost.
  Details    : shrink_dcache_parent can be in long loop with destroy dentries,
              use shrink_dcache_sb instead.
  
  Severity   : minor
+Frequency  : only when echo_client is used
  Bugzilla   : 14949
-Description: don't panic with use echo client
-Details    : echo client pass NULL as client nid pointer and this produce null
+Description: don't panic with use echo_client
+Details    : echo client pass NULL as client nid pointer and this produce NULL
              pointer dereference.
  
  Severity   : normal
+Frequency  : Always on 32-bit PowerPC systems
  Bugzilla   : 15278
-Description: fix build on ppc32
-Details    : compile code with -m64 flag produce wrong object file for ppc32.
+Description: fix build on PPC32
+Details    : compile code with -m64 flag produce wrong object file for PPC32.
  
  Severity   : normal
+Frequency  : rare
  Bugzilla   : 15574
  Description: MDS LBUG: ASSERTION(!IS_ERR(dchild))
-Details    : In reconstruct_* functions, LASSERTs on both the data supplied by
-            a client, and the data on disk are dangerous and incorrect. Change
-            them with client eviction.
+Details    : In reconstruct_* functions, LASSERTs on both the data supplied
+            by a client, and the data on disk are dangerous and incorrect.
+            Change them with client eviction.
  
-Severity   : normal
+Severity   : enhancement
  Bugzilla   : 15346
  Description: skiplist implementation simplification
  Details    : skiplists are used to group compatible locks on granted list
-            that was implemented as tracking first and last lock of each lock group
-            the patch changes that to using doubly linked lists
+            that was implemented as tracking first and last lock of each lock
+            group the patch changes that to using doubly linked lists
  
  Severity   : normal
  Bugzilla   : 15933
  Description: delete compatibility for 32bit qdata
-Details    : as planned, when lustre is beyond b1_8, lquota won't support for 32bit
-            qunit. That means servers of b1_4 and servers of b1_8 can't be used
-            together if users want to use quota.
+Details    : as planned, when lustre is beyond b1_8, lquota won't support 32bit
+            qunit. That means servers of b1_4 and servers of b1_8 can't be
+            used together if users want to use quota.
  
  Severity   : normal
-Frequency  : blocks per group is less than blocksize*8 and uninit_groups is enabled
+Frequency  : only with administrator action
+Bugzilla   : 14693
+Description: mount failure if config log has invalid conf_param setting
+Details    : If administrator specified an incorrect configuration parameter
+            with "lctl conf_param" this would cause an error during future
+            client mounts.  Instead, ignore the bad configuration parameter.
+
+Severity   : normal
+Frequency  : blocks per group < blocksize*8 and uninit_groups is enabled
  Bugzilla   : 15932
  Description: ldiskfs error: XXX blocks in bitmap, YYY in gd
  Details    : If blocks per group is less than blocksize*8, set rest of the
@@ -239,7 +601,7 @@ Description: more ldlm soft lockups
  Details    : In ldlm_resource_add_lock(), call to ldlm_resource_dump()
              starve other threads from the resource lock for a long time in
              case of long waiting queue, so change the debug level from
-            OTHER to the less frequently used D_INFO.
+            D_OTHER to the less frequently used D_INFO.
  
  Severity   : enhancement
  Bugzilla   : 13128
@@ -247,11 +609,11 @@ Description: add -gid, -group, -uid, -user options to lfs find
  
  Severity   : enhancement
  Bugzilla   : 15284
-Description: ll_recover_lost_found_objs - rename objects in lost+found to object ID
-Details           : OST crashes and subsequent e2fsck can lead to objects being moved
-            to lost+found directory. Using the "ll_recover_lost_found_objs"
+Description: ll_recover_lost_found_objs - recover objects in lost+found
+Details    : OST corruption and subsequent e2fsck can leave objects in the
+            lost+found directory.  Using the "ll_recover_lost_found_objs"
              tool, these objects can be retrieved and data can be salvaged
-            by using the object ID saved in the fid.
+            by using the object ID saved in the fid EA on each object.
  
  Severity   : minor
  Frequency  : rare
@@ -270,32 +632,213 @@ Details    : The direct IO path doesn't call check_rpcs to submit a new RPC once
  
  Severity   : normal
  Bugzilla   : 15684
-Description: Procfs and llog threads access destoryed import sometimes. 
+Description: Procfs and llog threads access destoryed import sometimes.
  Details    : Sync the import destoryed process with procfs and llog threads by
              the import refcount and semaphore.
  
--------------------------------------------------------------------------------
+Severity   : major
+Bugzilla   : 15674
+Description: mds fails to respond, threads stuck in ldlm_completion_ast
+Details    : Sort source/child resource pair after updating child resource.
+
+Severity   : major
+Frequncy   : rare
+Bugzilla   : 16226
+Description: kernel BUG at ldiskfs2_ext_new_extent_cb
+Details    : If insertion of an extent fails, then discard the inode
+            preallocation and free data blocks else it can lead to duplicate
+            blocks.
+
+Severity   : normal
+Bugzilla   : 16199
+Description: don't always update ctime in ext3_xattr_set_handle()
+Details    : Current xattr code updates the inode ctime in ext3_xattr_set_handle.
+            In some cases the ctime should not be updated, for example for
+            2.0->1.8 compatibility it is necessary to delete an xattr and it
+            should not update the ctime.
+
+Severity   : normal
+Bugzilla   : 15058
+Description: add quota statistics
+Details    : 1. sort out quota proc entries and proc code.
+            2. add quota statistics
  
+Severity   : normal
+Frequency  : often
+Bugzilla   : 16125
+Description: quotas are not honored with O_DIRECT
+Details    : all writes with the flag O_DIRECT will use grants which leads to
+            this problem. Now using OBD_BRW_SYNC to guard this.
+
+Severity   : major
+Frequency  : rare
+Bugzilla   : 15713/16362
+Description: Assertion in iopen_connect_dentry in 1.6.3
+Details    : looking up an inode via iopen with the wrong generation number can
+            populate the dcache with a disconneced dentry while the inode
+            number is in the process of being reallocated. This causes an
+            assertion failure in iopen since the inode's dentry list contains
+            both a connected and disconnected dentry.
+
+Severity   : normal
+Bugzilla   : 16496
+Description: assertion failure in ldlm_handle2lock()
+Details    : fix a race between class_handle_unhash() and class_handle2object()
+            introduced in lustre 1.6.5 by bug 13622.
+
+Severity   : enhancement
+Bugzilla   : 11817
+Description: superblock lock contention with many SMP cores on one client
+Details    : several client filesystem locks were highly contended on SMP
+            NUMA systems with 8 or more cores.  Per-CPU datastructures
+            and more efficient locking implemented to reduce contention.
+
+Severity   : minor
+Frequency  : rare
+Bugzilla   : 12755
+Description: Kernel BUG: sd_iostats_bump: unexpected disk index
+Details    : remove the limit of 256 scsi disks in the sd_iostat patch
+
+Severity   : minor
+Frequency  : rare
+Bugzilla   : 16494
+Description: oops in sd_iostats_seq_show()
+Details    : unloading/reloading the scsi low level driver triggers a kernel
+            bug when trying to access the sd iostat file.
+
+Severity   : major
+Frequency  : rare
+Bugzilla   : 16404
+Description: Kernel panics during QLogic driver reload
+Details    : REQ_BLOCK_PC requests are not handled properly in the sd iostat
+            patch, causing memory corruption.
+
+Severity   : minor
+Frequency  : rare
+Bugzilla   : 16140
+Description: journal_dev option does not work in b1_6
+Details    : pass mount option during pre-mount.
+
+Severity   : enhancement
+Bugzilla   : 10555
+Description: Add a FIEMAP(FIle Extent MAP) ioctl for ldiskfs
+Details    : FIEMAP ioctl will allow an application to efficiently fetch the
+            extent information of a file. It can be used to map logical blocks
+            in a file to physical blocks in the block device.
+
+Severity   : normal
+Frequency  : only with adaptive timeout enabled
+Bugzilla   : 16972
+Description: DEBUG_REQ() bad paging request
+Details    : ptlrpc_at_recv_early_reply() should not modify req->rq_repmsg
+            because it can be accessed by reply_in_callback() without the
+            rq_lock held.
+
+Severity   : normal
+Frequency  : only on Cray X2
+Bugzilla   : 16813
+Description: X2 build failures
+Details    : fix build failures on Cray X2.
+
+Severity   : normal
+Bugzilla   : 2066
+Description: xid & resent requests
+Details    : Initialize RPC XID from clock at startup (randomly if clock is
+            bad).
+
+Severity   : major
+Bugzilla   : 14840
+Description: quota recovery deadlock during mds failover
+Details    : This patch includes att18982, att18236, att18237 in bz14840.
+            Solve the problems:
+            1. fix osts hang when mds does failover with quotaon
+            2. prevent watchdog storm when osts threads wait for the
+               recovery of mds
+
+Severity   : normal
+Bugzilla   : 16695
+Description: kernel panic on racer
+Details    : Do not access dchild->d_inode when IS_ERR(dchild) is true.
+
+Severity   : enhancement
+Bugzilla   : 14095
+Description: Add lustre_start utility to start or stop multiple Lustre servers
+            from a CSV file.
+
+Severity   : major
+Bugzilla   : 17024
+Description: Lustre GPF in {:ptlrpc:ptlrpc_server_free_request+373}
+Details    : In case of memory pressure, list_del() can be called twice on
+            req->rq_history_list, causing a kernel oops.
+
+Severity   : normal
+Bugzilla   : 17026
+Description: (ptllnd_peer.c:557:kptllnd_peer_check_sends()) ASSERTION(!in_interrupt()) failed
+Details    : fix stack overflow in the distributed lock manager by defering export
+            eviction after a failed ast to the elt thread instead of handling
+            it in the dlm interpret routine.
+
+Severity   : enhancement
+Bugzilla   : 12800
+Description: More exported tunables for mballoc
+Details    : Add support for tunable preallocation window and new tunables for
+            large/small requests
+
+Severity   : normal
+Bugzilla   : 16680
+Description: Detect corruption of block bitmap and checking for preallocations
+Details    : Checks validity of on-disk block bitmap. Also it does better
+            checking of number of applied preallocations. When corruption is
+            found, it turns filesystem readonly to prevent further corruptions.
+
+Severity   : normal
+Bugzilla   : 16438
+Frequency  : only for big-endian servers
+Description: Check if system is big-endian while mounting fs with extents feature
+Details    : Mounting a filesystem with extents feature will fail on big-endian
+            systems since ext3-based ldiskfs is not supported on big-endian
+            systems. This can be over-riden with "bigendian_extents" mount option.
+
+Severity   : normal
+Bugzilla   : 16860
+Description: Excessive recovery window
+Details    : With AT enabled, the recovery window can be excessively long (6000+
+            seconds). To address this problem, we no longer use
+            OBD_RECOVERY_FACTOR when extending the recovery window (the connect
+            timeout no longer depends on the service time, it is set to
+            INITIAL_CONNECT_TIMEOUT now) and clients report the old service
+            time via pb_service_time.
+
+Severity   : normal
+Bugzilla   : 16522
+Description: Watchdog triggered on MDS failover
+Details    : enable OBD_CONNECT_MDT flag when connecting from the MDS so that
+            the OSTs know that the MDS "UUID" can be reused for the same export
+            from a different NID, so we do not need to wait for the export to be
+            evicted
+
+-------------------------------------------------------------------------------
  
  2008-05-26  Sun Microsystems, Inc.
         * version 1.6.5
         * Support for kernels:
-        2.6.5-7.311 (SLES 9),
-        2.6.9-67.0.7.EL (RHEL 4),
-        2.6.16.54-0.2.5 (SLES 10),
-        2.6.22.14 vanilla (kernel.org)
+       2.6.5-7.311 (SLES 9),
+       2.6.9-67.0.7.EL (RHEL 4),
+       2.6.16.54-0.2.5 (SLES 10),
+       2.6.18-53.1.14.el5 (RHEL 5),
+       2.6.22.14 vanilla (kernel.org)
         * Client support for unpatched kernels:
-         (see http://wiki.lustre.org/index.php?title=Patchless_Client)
-         2.6.16 - 2.6.22 vanilla (kernel.org)
+        (see http://wiki.lustre.org/index.php?title=Patchless_Client)
+        2.6.16 - 2.6.22 vanilla (kernel.org)
         * Due to problems with nested symlinks and FMODE_EXEC (bug 12652),
-         we do not recommend using patchless RHEL4 clients with kernels
-         prior to 2.6.9-55EL (RHEL4U5).
+        we do not recommend using patchless RHEL4 clients with kernels
+        prior to 2.6.9-55EL (RHEL4U5).
         * Recommended e2fsprogs version: 1.40.7-sun1
         * Note that reiserfs quotas are disabled on SLES 10 in this kernel.
         * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a
-         removed cwd "./" (refer to Bugzilla 14399).
+        removed cwd "./" (refer to Bugzilla 14399).
         * A new quota file format has been introduced in 1.6.5.
-         The format conversion from prior releases is handled transparently,
+        The format conversion from prior releases is handled transparently,
          but releases older than 1.4.12/1.6.5 will not understand this new
          format.  The automatic format conversion can be avoided by running
          the following command on the MDS before upgrading:
@@ -306,13 +849,20 @@ Severity   : major
  Bugzilla   : 14443
  Description: quota performance fix
  Details    : quota data is written in journalled mode instead of ordered to
-             increase performance
+            increase performance
  
  Severity   : normal
  Bugzilla   : 13915
  Description: lfs support for human-readable quota grace time strings
  Details    : lfs setquota -t and lfs quota -t represent quota grace times
-             in "XXwXXdXXhXXmXXs" format instead of large values in seconds
+            in "XXwXXdXXhXXmXXs" format instead of large values in seconds
+
+Severity   : normal
+Frequency  : always with o2ib 1.3 and sles10
+Bugzilla   : 15870
+Description: fix build with SLES10 and o2ib v3.
+Details    : sles10 uses diffrent name for Module.symver file but configure
+            assume this file has same name on RHEL/SLES/vanila kernels.
  
  Severity   : critical
  Frequency  : very rare, if additional xattrs are used on kernels >= 2.6.12
@@ -343,20 +893,20 @@ Details      : Mountpoint references were being leaked during open reply
  Severity   : minor
  Frequency  : rare
  Bugzilla   : 13380
-Description: fix for occasional failure case of -ENOSPC in recovery-small tests 
-Details    : Move the 'good_osts' check before the 'total_bavail' check.  This 
-            will result in an -EAGAIN and in the exit call path we call 
-            alloc_rr() which will with increasing aggressiveness attempt to 
+Description: fix for occasional failure case of -ENOSPC in recovery-small tests
+Details    : Move the 'good_osts' check before the 'total_bavail' check.  This
+            will result in an -EAGAIN and in the exit call path we call
+            alloc_rr() which will with increasing aggressiveness attempt to
              aquire precreated objects on the minimum number of required OSCs.
  
  Severity   : major
  Bugzilla   : 14326
  Description: Use old size assignment to avoid deadlock
  Details    : This reverts the changes in bugs 2369 and bug 14138 that introduced
-            the scheduling while holding a spinlock.  We do not need locking 
-            for size in ll_update_inode() because size is only updated from 
-            the MDS for directories or files without objects, so there is no 
-            other place to do the update, and concurrent access to such inodes 
+            the scheduling while holding a spinlock.  We do not need locking
+            for size in ll_update_inode() because size is only updated from
+            the MDS for directories or files without objects, so there is no
+            other place to do the update, and concurrent access to such inodes
              are protected by the inode lock.
  
  Severity   : normal
@@ -376,8 +926,8 @@ Severity   : normal
  Bugzilla   : 14872
  Description: the recovery timer never expires
  Details    : for new client connect request, the recovery timer should not be
-            reset, otherwise recovery timer will never expired, if the old 
-            client never come. Only old client connect and first connection 
+            reset, otherwise recovery timer will never expired, if the old
+            client never come. Only old client connect and first connection
              req should trigger recovery timer reset.
  
  Severity   : normal
@@ -489,9 +1039,9 @@ Description: Disable adaptive timeouts by default
  Severity   : major
  Frequency  : on network error
  Bugzilla   : 15027
-Description: panic with double free request if network error 
+Description: panic with double free request if network error
  Details    : mdc_finish_enqueue is finish request if any network error occuring,
-            but it's true only for synchronus enqueue, for async enqueue 
+            but it's true only for synchronus enqueue, for async enqueue
              (via ptlrpcd) this incorrect and ptlrpcd want finish request
              himself.
  
@@ -558,14 +1108,14 @@ Severity   : normal
  Frequency  : rare
  Bugzilla   : 14421
  Description: ASSERTION(!PageDirty(page)) failed
-Details    : Wrong check could lead to an assertion failure under specific 
+Details    : Wrong check could lead to an assertion failure under specific
              load patterns.
  
  Severity   : normal
  Frequency  : rare
  Bugzilla   : 12228
  Description: LBUG in ptlrpc_check_set() bad phase ebc0de00
-Details    : access to bitfield in structure is always rounded to long 
+Details    : access to bitfield in structure is always rounded to long
              and this produce problem with not atomic change any bit.
  
  Severity   : normal
@@ -917,9 +1467,9 @@ Details    : Client gets evicted from server.  Now client also thinks it is
  
  Severity   : normal
  Bugzilla   : 14483
-Description: Detect stride IO mode in read-ahead 
+Description: Detect stride IO mode in read-ahead
  Details    : When a client does stride read, read-ahead should detect that and
-            read-ahead pages according to the detected stride pattern. 
+            read-ahead pages according to the detected stride pattern.
  
  Severity   : normal
  Bugzilla   : 15033
@@ -974,8 +1524,8 @@ Details    : Force q->max_phys_segments to MAX_PHYS_SEGMENTS on SLES10 to be
  Severity   : normal
  Bugzilla   : 15198
  Description: LDLM soft lockups - improvement
-Details    : It is be possible to send the lock handle along with each read 
-            or write request because the client is already doing a lock match 
+Details    : It is be possible to send the lock handle along with each read
+            or write request because the client is already doing a lock match
              itself so there isn't any reason the OST should have to re-do that
              search.
  
@@ -993,7 +1543,7 @@ Frequency  : rare
  Bugzilla   : 15776
  Description: Extent locks not granted with no conflicts sometimes.
  Details    : When race occurs in glimpse handler and nothing is returned,
-             we do not reprocess the queue after lock cancel, and that leads
+            we do not reprocess the queue after lock cancel, and that leads
              to a stall until next activity on a resource
  
  Severity   : normal
@@ -1003,10 +1553,10 @@ Description: during mds failovers with quota on, OSTs got into deadlock state
              and causing dumpstack.
  Details           : for every quota slave, at any time, there is only one quota req
              is sent to quota master for every uid/gid. Before that quota req
-            returns, all the thread relative to the same uid/gid will wait. 
+            returns, all the thread relative to the same uid/gid will wait.
              So if the quota req is lost because mds failovers or any other
              reasons, this bug will be hit. Now, dqacq_interpret() will handle
-            quota reqs who time out. 
+            quota reqs who time out.
  
  Severity   : enhancement
  Frequency  : always
@@ -1045,9 +1595,9 @@ Severity   : normal
  Frequency  : occasional
  Bugzilla   : 13730
  Description: Do not fail import if osc_interpret_create gets -EAGAIN
-Details    : If osc_interpret_create got -EAGAIN it immediately exits and 
-            wakeup oscc_waitq.  After wakeup oscc_wait_for_objects call 
-            oscc_has_objects and see OSC has no objests and call 
+Details    : If osc_interpret_create got -EAGAIN it immediately exits and
+            wakeup oscc_waitq.  After wakeup oscc_wait_for_objects call
+            oscc_has_objects and see OSC has no objests and call
              oscc_internal_create to resend create request.
  
  Severity   : enhancement
@@ -1074,7 +1624,7 @@ Details    : This causes SLES 10 clients to behave as patchless clients
              even on a Lustre-patched (server) kernel.
  
  Severity   : enhancement
-Bugzilla   : 2369 
+Bugzilla   : 2369
  Description: use i_size_read and i_size_write in 2.6 port
  Details    : replace inode->i_size access with i_size_read/write()
  
@@ -1098,7 +1648,7 @@ Frequency  : only on ppc
  Bugzilla   : 12234
  Description: /proc/fs/lustre/devices broken on ppc
  Details    : The patch as applied to 1.6.2 doesn't look correct for all arches.
-            We should make sure the type of 'index' is loff_t and then cast 
+            We should make sure the type of 'index' is loff_t and then cast
              explicitly as needed below. Do not assign an explicitly cast
              loff_t to an int.
  
@@ -1165,15 +1715,15 @@ Frequency  : always
  Bugzilla   : 13751
  Description: Kernel patches update for RHEL5 2.6.18-8.1.14.el5.
  Details    : Modify target file & which_patch.
-            A flaw was found in the IA32 system call emulation provided 
-            on AMD64 and Intel 64 platforms. An improperly validated 64-bit 
-            value could be stored in the %RAX register, which could trigger an 
-            out-of-bounds system call table access. An untrusted local user 
-            could exploit this flaw to run code in the kernel 
-            (ie a root privilege escalation). (CVE-2007-4573). 
+            A flaw was found in the IA32 system call emulation provided
+            on AMD64 and Intel 64 platforms. An improperly validated 64-bit
+            value could be stored in the %RAX register, which could trigger an
+            out-of-bounds system call table access. An untrusted local user
+            could exploit this flaw to run code in the kernel
+            (ie a root privilege escalation). (CVE-2007-4573).
  
  Severity   : minor
-Bugzilla   : 13732 
+Bugzilla   : 13732
  Description: change order of libsysio includes
  Details    : '#include sysio.h' should always come before '#include xtio.h'
  
@@ -1188,12 +1738,12 @@ Frequency  : rarely
  Bugzilla   : 13570
  Description: To avoid grant space > avaible space when the disk is almost
              full. Without this patch you might see the error "grant XXXX >
-            available" or some LBUG about grant, when the disk is almost 
+            available" or some LBUG about grant, when the disk is almost
              full.
  Details    : In filter_check_grant, for non_grant cache write, we should
-            check the left space by  if (*left > ungranted + bytes), instead 
-            of (*left > ungranted), because only we are sure the left space 
-            is enough for another "bytes", then the ungrant space should be 
+            check the left space by  if (*left > ungranted + bytes), instead
+            of (*left > ungranted), because only we are sure the left space
+            is enough for another "bytes", then the ungrant space should be
              increase. In client, we should update cl_avail_grant only there is
              OBD_MD_FLGRANT in the reply.
  
@@ -1212,6 +1762,14 @@ Description: Incorrect file ownership on O_DIRECT output files
  Details    : block usage reported by 'lfs quota' does not take into account
              files that have been written with O_DIRECT.
  
+Severity   : normal
+Bugzilla   : 17197
+Description: (rw.c:1323:ll_read_ahead_pages()) ASSERTION(page_idx > ria->ria_stoff) failed
+Details    : Once the unmatched stride IO mode is detected, shrink the stride-ahead
+            window to 0. If it does hit cache miss, and read-pattern is still
+            stride-io mode, does not reset the stride window, but also does not
+            increase the stride window length in this case.
+
  --------------------------------------------------------------------------------
  
  2007-09-27         Cluster File Systems, Inc. <info@clusterfs.com>
@@ -1356,10 +1914,10 @@ Details    : change the condition to increase offset_idx
  Severity   : enhancement
  Bugzilla   : 2262
  Description: self-adjustable client's lru lists
-Details    : use adaptive algorithm for managing client cached locks lru 
+Details    : use adaptive algorithm for managing client cached locks lru
              lists according to current server load, other client's work
-            pattern, memory activities, etc. Both, server and client 
-            side namespaces provide number of proc tunables for controlling 
+            pattern, memory activities, etc. Both, server and client
+            side namespaces provide number of proc tunables for controlling
              things
  
  Severity   : cleanup
@@ -1368,17 +1926,17 @@ Description: rewrite ext2-derived code in llite/dir.c and obdclass/uuid.c
  Details    : rewrite inherited code (uuid parsing code from ext2 utils and
              readdir code from ext3) from scratch preserving functionality.
  
-Severity   : normal 
+Severity   : normal
  Bugzilla   : 13436
  Description: Only those disconnect error should be returned by rq_status.
-Details    : In open/enqueue processs, Some errors, which will cause client 
-            disconnected, should be returned by rq_status, while other 
+Details    : In open/enqueue processs, Some errors, which will cause client
+            disconnected, should be returned by rq_status, while other
              errors should still be returned by intent, then mdc or llite will
              detect them.
  
  Severity   : enhancement
  Bugzilla   : 11230
-Description: Tune the kernel for good SCSI performance. 
+Description: Tune the kernel for good SCSI performance.
  Details    : Set the value of /sys/block/{dev}/queue/max_sectors_kb
              to the value of /sys/block/{dev}/queue/max_hw_sectors_kb
              in mount_lustre.
@@ -1410,7 +1968,7 @@ Severity   : normal
  Bugzilla   : 13304
  Frequency  : Always, for kernels after 2.6.16
  Description: Fix warning idr_remove called for id=.. which is not allocated.
-Details    : Last kernels save old s_dev before kill super and not allow 
+Details    : Last kernels save old s_dev before kill super and not allow
              to restore from callback - restore it before call kill_anon_super.
  
  Severity   : normal
@@ -1866,10 +2424,10 @@ Details    : Lov_mds_md was not free in an error handler in mds_create_object.
              be freed no matter whether fsfilt_commit success or not.
  
  Severity   : minor
-Frequency  : only with huge numbers of clients
+Frequency  : only with large numbers of cores on a single node
  Bugzilla   : 11817
  Description: Prevents from taking the superblock lock in llap_from_page for
-            a soon died page.
+            a soon killed page.
  Details    : using LL_ORIGIN_REMOVEPAGE origin flag instead of LL_ORIGIN_UNKNOW
              for llap_from_page call in ll_removepage() prevents from taking
              the superblock lock for a soon died page.
diff --git a/lustre/LICENSE.cray b/lustre/LICENSE.cray

new file mode 100644 (file)

index 0000000..ba5a473
--- /dev/null
+++ b/lustre/LICENSE.cray
@@ -0,0 +1,384 @@
+Each file in this distribution contains a header stating the copyright
+owner(s), and the licensing terms for that file.  Some files are not
+eligible for copyright protection, and contain neither.
+
+There are many files which may be covered by a separate license that
+you signed or otherwise agreed to before downloading this software.
+If you did not agree to such an agreement, or if the file does not
+mention that license, then you can redistribute and/or modify it under
+the terms of version 2 of the GNU General Public License.  Each file
+is very clear about which license is applicable.
+
+In any case, Lustre is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the license
+text for more details.
+
+Reproduced below is the GNU General Public License version 2, and
+Linus's clarifying statement from the Linux kernel source code:
+
+----------------------------------------
+
+   NOTE! This copyright does *not* cover user programs that use kernel
+ services by normal system calls - this is merely considered normal use
+ of the kernel, and does *not* fall under the heading of "derived work".
+ Also note that the GPL below is copyrighted by the Free Software
+ Foundation, but the instance of code that it refers to (the Linux
+ kernel) is copyrighted by me and others who actually wrote it.
+
+                       Linus Torvalds
+
+----------------------------------------
+
+                   GNU GENERAL PUBLIC LICENSE
+                      Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                           Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+\f
+                   GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+\f
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+\f
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+\f
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                           NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                    END OF TERMS AND CONDITIONS
+\f
+           How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) 19yy  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) 19yy name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
+*** Use this verbiage for CRAY ***
+
+You may have signed or agreed to another license before downloading
+this software.  If so, you are bound by the terms and conditions
+of that agreement, and the following does not apply to you.  See the
+LICENSE file included with this distribution for more information.
+
+If you did not agree to a different license, then this copy of Lustre
+is open source software; you can redistribute it and/or modify it
+under the terms of version 2 of the GNU General Public License as
+published by the Free Software Foundation.
diff --git a/lustre/Makefile.in b/lustre/Makefile.in

index c06794a..ca60f36 100644 (file)
--- a/lustre/Makefile.in
+++ b/lustre/Makefile.in
@@ -5,9 +5,9 @@ subdir-m += ptlrpc
  subdir-m += osc
  subdir-m += obdecho
  subdir-m += mgc
+subdir-m += quota
  
  @SERVER_TRUE@subdir-m += mds obdfilter ost mgs
  @CLIENT_TRUE@subdir-m += mdc llite 
-@QUOTA_TRUE@subdir-m += quota
  
  @INCLUDE_RULES@
diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am

index 53ae48a..13d576c 100644 (file)
--- a/lustre/autoMakefile.am
+++ b/lustre/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  AUTOMAKE_OPTIONS = foreign
  
@@ -27,9 +58,7 @@ if CLIENT
  SUBDIRS += $(CLIENT_SUBDIRS)
  endif
  
-if QUOTA
  SUBDIRS += $(QUOTA_SUBDIRS)
-endif
  
  # this needs to be after the client subdirs
  if LIBLUSTRE
diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4

index 2792f61..9b684da 100644 (file)
--- a/lustre/autoconf/lustre-core.m4
+++ b/lustre/autoconf/lustre-core.m4
@@ -122,6 +122,26 @@ LB_LINUX_TRY_COMPILE([
  ])
  
  #
+# LC_FUNC_RELEASEPAGE_WITH_GFP
+#
+# if ->releasepage() takes a gfp_t arg in 2.6.9
+# This kernel defines gfp_t (HAS_GFP_T) but doesn't use it for this function,
+# while others either don't have gfp_t or pass gfp_t as the parameter.
+#
+AC_DEFUN([LC_FUNC_RELEASEPAGE_WITH_GFP],
+[AC_MSG_CHECKING([if releasepage has a gfp_t parameter])
+RELEASEPAGE_WITH_GFP="`grep -c 'releasepage.*gfp_t' $LINUX/include/linux/fs.h`"
+if test "$RELEASEPAGE_WITH_GFP" != 0 ; then
+       AC_DEFINE(HAVE_RELEASEPAGE_WITH_GFP, 1,
+                  [releasepage with gfp_t parameter])
+       AC_MSG_RESULT([yes])
+else
+       AC_MSG_RESULT([no])
+fi
+])
+
+
+#
  # LC_FUNC_ZAP_PAGE_RANGE
  #
  # if zap_page_range() takes a vma arg
@@ -220,9 +240,6 @@ LB_LINUX_TRY_COMPILE([
         #include <linux/fs.h>
         #include <linux/version.h>
  ],[
-       #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,24))
-       #error "down_read_trylock broken before 2.4.24"
-       #endif
         struct inode i;
         return (char *)&i.i_alloc_sem - (char *)&i;
  ],[
@@ -237,27 +254,52 @@ LB_LINUX_TRY_COMPILE([
  # LC_FUNC_REGISTER_CACHE
  #
  # if register_cache() is defined by kernel
+# 
+# There are two ways to shrink one customized cache in linux kernels. For the
+# kernels are prior than 2.6.5(?), register_cache() is used, and for latest 
+# kernels, set_shrinker() is used instead.
  #
  AC_DEFUN([LC_FUNC_REGISTER_CACHE],
-[AC_MSG_CHECKING([if kernel defines register_cache()])
+[AC_MSG_CHECKING([if kernel defines cache pressure hook])
  LB_LINUX_TRY_COMPILE([
-       #include <linux/list.h>
-       #include <linux/cache_def.h>
+       #include <linux/mm.h>
  ],[
-       struct cache_definition cache;
+       shrinker_t shrinker;
+
+       set_shrinker(1, shrinker);
  ],[
-       AC_MSG_RESULT([yes])
-       AC_DEFINE(HAVE_REGISTER_CACHE, 1, [register_cache found])
-       AC_MSG_CHECKING([if kernel expects return from cache shrink function])
-       HAVE_CACHE_RETURN_INT="`grep -c 'int.*shrink' $LINUX/include/linux/cache_def.h`"
-       if test "$HAVE_CACHE_RETURN_INT" != 0 ; then
-               AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, [kernel expects return from shrink_cache])
-               AC_MSG_RESULT(yes)
-       else
-               AC_MSG_RESULT(no)
-       fi
+       AC_MSG_RESULT([set_shrinker])
+       AC_DEFINE(HAVE_SHRINKER_CACHE, 1, [shrinker_cache found])
+       AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, [shrinkers should return int])
  ],[
-       AC_MSG_RESULT([no])
+       LB_LINUX_TRY_COMPILE([
+               #include <linux/list.h>
+               #include <linux/cache_def.h>
+       ],[
+               struct cache_definition cache;
+       ],[
+               AC_MSG_RESULT([register_cache])
+               AC_DEFINE(HAVE_REGISTER_CACHE, 1, [register_cache found])
+               AC_MSG_CHECKING([if kernel expects return from cache shrink ])
+               tmp_flags="$EXTRA_KCFLAGS"
+               EXTRA_KCFLAGS="-Werror"
+               LB_LINUX_TRY_COMPILE([
+                       #include <linux/list.h>
+                       #include <linux/cache_def.h>
+               ],[
+                       struct cache_definition c;
+                       c.shrinker = (int (*)(int, unsigned int))1;
+               ],[
+                       AC_DEFINE(HAVE_CACHE_RETURN_INT, 1,
+                                 [kernel expects return from shrink_cache])
+                       AC_MSG_RESULT(yes)
+               ],[
+                       AC_MSG_RESULT(no)
+               ])
+               EXTRA_KCFLAGS="$tmp_flags"
+       ],[
+               AC_MSG_RESULT([no])
+       ])
  ])
  ])
  
@@ -551,6 +593,28 @@ AC_DEFUN([LC_XATTR_ACL],
  [])
  ])
  
+#
+# LC_LINUX_FIEMAP_H
+#
+# If we have fiemap.h
+# after 2.6.27 use fiemap.h in include/linux
+#
+AC_DEFUN([LC_LINUX_FIEMAP_H],
+[LB_CHECK_FILE([$LINUX/include/linux/fiemap.h],[
+        AC_MSG_CHECKING([if fiemap.h can be compiled])
+        LB_LINUX_TRY_COMPILE([
+                #include <linux/fiemap.h>
+        ],[],[
+                AC_MSG_RESULT([yes])
+                AC_DEFINE(HAVE_LINUX_FIEMAP_H, 1, [Kernel has fiemap.h])
+        ],[
+                AC_MSG_RESULT([no])
+        ])
+],
+[])
+])
+
+
  AC_DEFUN([LC_STRUCT_INTENT_FILE],
  [AC_MSG_CHECKING([if struct open_intent has a file field])
  LB_LINUX_TRY_COMPILE([
@@ -792,7 +856,7 @@ LB_LINUX_TRY_COMPILE([
  # LC_INODE_I_MUTEX
  # after 2.6.15 inode have i_mutex intead of i_sem
  AC_DEFUN([LC_INODE_I_MUTEX],
-[AC_MSG_CHECKING([use inode have i_mutex ])
+[AC_MSG_CHECKING([if inode has i_mutex ])
  LB_LINUX_TRY_COMPILE([
         #include <linux/mutex.h>
         #include <linux/fs.h>
@@ -806,7 +870,7 @@ LB_LINUX_TRY_COMPILE([
          AC_DEFINE(HAVE_INODE_I_MUTEX, 1,
                  [after 2.6.15 inode have i_mutex intead of i_sem])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  ])
  
@@ -827,7 +891,7 @@ LB_LINUX_TRY_COMPILE([
          AC_DEFINE(HAVE_DQUOTOFF_MUTEX, 1,
                  [after 2.6.17 dquote use mutex instead if semaphore])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  ])
  
@@ -887,7 +951,7 @@ LB_LINUX_TRY_COMPILE([
         AC_DEFINE(HAVE_INVALIDATEPAGE_RETURN_INT, 1,
                 [Define if return type of invalidatepage should be int])
  ],[
-       AC_MSG_RESULT(NO)
+       AC_MSG_RESULT(no)
  ])
  ])
  
@@ -916,7 +980,7 @@ LB_LINUX_TRY_COMPILE([
         AC_DEFINE(HAVE_UMOUNTBEGIN_VFSMOUNT, 1,
                 [Define umount_begin need second argument])
  ],[
-       AC_MSG_RESULT(NO)
+       AC_MSG_RESULT(no)
  ])
  EXTRA_KCFLAGS="$tmp_flags"
  ])
@@ -935,7 +999,7 @@ LB_LINUX_TRY_COMPILE([
         AC_DEFINE(HAVE_INODE_BLKSIZE, 1,
                 [struct inode has i_blksize field])
  ],[
-       AC_MSG_RESULT(NO)
+       AC_MSG_RESULT(no)
  ])
  ])
  
@@ -962,7 +1026,7 @@ LB_LINUX_TRY_COMPILE([
          AC_DEFINE(HAVE_VFS_READDIR_U64_INO, 1,
                  [if vfs_readdir need 64bit inode number])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  EXTRA_KCFLAGS="$tmp_flags"
  ])
@@ -974,14 +1038,14 @@ AC_DEFUN([LC_FILE_WRITEV],
  LB_LINUX_TRY_COMPILE([
          #include <linux/fs.h>
  ],[
-        struct file_operations *fops;
+        struct file_operations *fops = NULL;
          fops->writev = NULL;
  ],[
          AC_MSG_RESULT(yes)
          AC_DEFINE(HAVE_FILE_WRITEV, 1,
                  [use fops->writev])
  ],[
-       AC_MSG_RESULT(NO)
+       AC_MSG_RESULT(no)
  ])
  ])
  
@@ -992,14 +1056,14 @@ AC_DEFUN([LC_FILE_READV],
  LB_LINUX_TRY_COMPILE([
          #include <linux/fs.h>
  ],[
-        struct file_operations *fops;
+        struct file_operations *fops = NULL;
          fops->readv = NULL;
  ],[
          AC_MSG_RESULT(yes)
          AC_DEFINE(HAVE_FILE_READV, 1,
                  [use fops->readv])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  ])
  
@@ -1016,7 +1080,7 @@ LB_LINUX_TRY_COMPILE([
          AC_DEFINE(HAVE_NR_PAGECACHE, 1,
                  [is kernel export nr_pagecache])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  ])
  
@@ -1026,6 +1090,7 @@ LB_LINUX_TRY_COMPILE([
  AC_DEFUN([LC_CANCEL_DIRTY_PAGE],
  [AC_MSG_CHECKING([kernel has cancel_dirty_page])
  LB_LINUX_TRY_COMPILE([
+        #include <linux/mm.h>
          #include <linux/page-flags.h>
  ],[
          cancel_dirty_page(NULL, 0);
@@ -1034,7 +1099,7 @@ LB_LINUX_TRY_COMPILE([
          AC_DEFINE(HAVE_CANCEL_DIRTY_PAGE, 1,
                    [kernel has cancel_dirty_page instead of clear_page_dirty])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  ])
  
@@ -1048,6 +1113,7 @@ LB_LINUX_TRY_COMPILE([
  AC_DEFUN([LC_PAGE_CONSTANT],
  [AC_MSG_CHECKING([if kernel have PageConstant defined])
  LB_LINUX_TRY_COMPILE([
+        #include <linux/mm.h>
          #include <linux/page-flags.h>
  ],[
          #ifndef PG_constant
@@ -1066,6 +1132,7 @@ LB_LINUX_TRY_COMPILE([
  AC_DEFUN([LC_PG_FS_MISC],
  [AC_MSG_CHECKING([kernel has PG_fs_misc])
  LB_LINUX_TRY_COMPILE([
+        #include <linux/mm.h>
          #include <linux/page-flags.h>
  ],[
          #ifndef PG_fs_misc
@@ -1076,7 +1143,7 @@ LB_LINUX_TRY_COMPILE([
          AC_DEFINE(HAVE_PG_FS_MISC, 1,
                    [is kernel have PG_fs_misc])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  ])
  
@@ -1084,6 +1151,7 @@ LB_LINUX_TRY_COMPILE([
  AC_DEFUN([LC_PAGE_CHECKED],
  [AC_MSG_CHECKING([kernel has PageChecked and SetPageChecked])
  LB_LINUX_TRY_COMPILE([
+        #include <linux/mm.h>
          #include <linux/page-flags.h>
  ],[
          #ifndef PageChecked
@@ -1097,7 +1165,7 @@ LB_LINUX_TRY_COMPILE([
          AC_DEFINE(HAVE_PAGE_CHECKED, 1,
                    [does kernel have PageChecked and SetPageChecked])
  ],[
-        AC_MSG_RESULT(NO)
+        AC_MSG_RESULT(no)
  ])
  ])
  
@@ -1146,6 +1214,38 @@ AC_DEFINE(HAVE___D_MOVE, 1,
  ])
  ])
  
+#
+# LC_EXPORT_INVALIDATE_MAPPING_PAGES
+#
+# SLES9, RHEL4, RHEL5, vanilla 2.6.24 export invalidate_mapping_pages() but
+# SLES10 2.6.16 does not, for some reason.  For filter cache invalidation.
+#
+AC_DEFUN([LC_EXPORT_INVALIDATE_MAPPING_PAGES],
+    [LB_CHECK_SYMBOL_EXPORT([invalidate_mapping_pages], [mm/truncate.c], [
+         AC_DEFINE(HAVE_INVALIDATE_MAPPING_PAGES, 1,
+                        [exported invalidate_mapping_pages])],
+    [LB_CHECK_SYMBOL_EXPORT([invalidate_inode_pages], [mm/truncate.c], [
+         AC_DEFINE(HAVE_INVALIDATE_INODE_PAGES, 1,
+                        [exported invalidate_inode_pages])], [
+       AC_MSG_ERROR([no way to invalidate pages])
+  ])
+    ],[])
+])
+
+#
+# LC_EXPORT_FILEMAP_FDATASYNC_RANGE
+#
+# No standard kernels export this
+#
+AC_DEFUN([LC_EXPORT_FILEMAP_FDATAWRITE_RANGE],
+[LB_CHECK_SYMBOL_EXPORT([filemap_fdatawrite_range],
+[mm/filemap.c],[
+AC_DEFINE(HAVE_FILEMAP_FDATAWRITE_RANGE, 1,
+            [filemap_fdatawrite_range is exported by the kernel])
+],[
+])
+])
+
  # The actual symbol exported varies among architectures, so we need
  # to check many symbols (but only in the current architecture.)  No
  # matter what symbol is exported, the kernel #defines node_to_cpumask
@@ -1209,6 +1309,26 @@ LB_LINUX_TRY_COMPILE([
  ])
  ])
  
+# 2.6.12 merge patch from oracle to convert tree_lock from spinlock to rwlock
+AC_DEFUN([LC_RW_TREE_LOCK],
+[AC_MSG_CHECKING([if kernel has tree_lock as rwlock])
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-Werror"
+LB_LINUX_TRY_COMPILE([
+        #include <linux/fs.h>
+],[
+        struct address_space a;
+
+        write_lock(&a.tree_lock);
+],[
+        AC_MSG_RESULT([yes])
+        AC_DEFINE(HAVE_RW_TREE_LOCK, 1, [kernel has tree_lock as rw_lock])
+],[
+        AC_MSG_RESULT([no])
+])
+EXTRA_KCFLAGS="$tmp_flags"
+])
+
  # 2.6.23 have return type 'void' for unregister_blkdev
  AC_DEFUN([LC_UNREGISTER_BLKDEV_RETURN_INT],
  [AC_MSG_CHECKING([if unregister_blkdev return int])
@@ -1265,10 +1385,11 @@ AC_DEFUN([LC_PROG_LINUX],
            LC_CONFIG_PINGER
            LC_CONFIG_CHECKSUM
            LC_CONFIG_LIBLUSTRE_RECOVERY
-          LC_CONFIG_QUOTA
            LC_CONFIG_HEALTH_CHECK_WRITE
            LC_CONFIG_LRU_RESIZE
            LC_CONFIG_ADAPTIVE_TIMEOUTS
+          LC_CONFIG_DELAYED_RECOVERY
+          LC_QUOTA_MODULE
  
            LC_TASK_PPTR
            # RHEL4 patches
@@ -1281,6 +1402,7 @@ AC_DEFUN([LC_PROG_LINUX],
  
            LC_STRUCT_KIOBUF
            LC_FUNC_COND_RESCHED
+          LC_FUNC_RELEASEPAGE_WITH_GFP
            LC_FUNC_ZAP_PAGE_RANGE
            LC_FUNC_PDE
            LC_FUNC_DIRECT_IO
@@ -1307,14 +1429,21 @@ AC_DEFUN([LC_PROG_LINUX],
            LC_QUOTA_READ
            LC_COOKIE_FOLLOW_LINK
            LC_FUNC_RCU
+          LC_PERCPU_COUNTER
            LC_QUOTA64
  
            # does the kernel have VFS intent patches?
            LC_VFS_INTENT_PATCHES
  
+          # 2.6.12
+          LC_RW_TREE_LOCK
+
            # 2.6.15
            LC_INODE_I_MUTEX
  
+          # 2.6.16
+          LC_SECURITY_PLUG  # for SLES10 SP2
+
            # 2.6.17
            LC_DQUOTOFF_MUTEX
  
@@ -1324,6 +1453,10 @@ AC_DEFUN([LC_PROG_LINUX],
            LC_VFS_KERN_MOUNT
            LC_INVALIDATEPAGE_RETURN_INT
            LC_UMOUNTBEGIN_HAS_VFSMOUNT
+         if test x$enable_server = xyes ; then
+                LC_EXPORT_INVALIDATE_MAPPING_PAGES
+                LC_EXPORT_FILEMAP_FDATAWRITE_RANGE
+         fi
  
            #2.6.18 + RHEL5 (fc6)
            LC_PG_FS_MISC
@@ -1340,8 +1473,8 @@ AC_DEFUN([LC_PROG_LINUX],
  
            # raid5-zerocopy patch
            LC_PAGE_CONSTANT
-         
-         # 2.6.22
+
+          # 2.6.22
            LC_INVALIDATE_BDEV_2ARG
            LC_FS_RENAME_DOES_D_MOVE
            # 2.6.23
@@ -1405,13 +1538,51 @@ if test x$enable_liblustre_acl = xyes ; then
    AC_DEFINE(LIBLUSTRE_POSIX_ACL, 1, Liblustre Support ACL-enabled MDS)
  fi
  
-AC_MSG_CHECKING([whether to build mpitests])
-AC_ARG_ENABLE([mpitests],
-       AC_HELP_STRING([--enable-mpitests],
-                       [build liblustre mpi tests]),
-       [],[enable_mpitests=no])
+#
+# --enable-mpitest
+#
+AC_ARG_ENABLE(mpitests,
+       AC_HELP_STRING([--enable-mpitest=yes|no|mpich directory],
+                           [include mpi tests]),
+       [
+        enable_mpitests=yes
+         case $enableval in
+         yes)
+               MPI_ROOT=/opt/mpich
+               LDFLAGS="$LDFLAGS -L$MPI_ROOT/ch-p4/lib -L$MPI_ROOT/ch-p4/lib64"
+               CFLAGS="$CFLAGS -I$MPI_ROOT/include"
+               ;;
+         no)
+               enable_mpitests=no
+               ;;
+        [[\\/$]]* | ?:[[\\/]]* )
+               MPI_ROOT=$enableval
+               LDFLAGS="$LDFLAGS -L$with_mpi/lib"
+               CFLAGS="$CFLAGS -I$MPI_ROOT/include"
+                ;;
+         *)
+                 AC_MSG_ERROR([expected absolute directory name for --enable-mpitests or yes or no])
+                 ;;
+        esac
+       ],
+       [
+       MPI_ROOT=/opt/mpich
+        LDFLAGS="$LDFLAGS -L$MPI_ROOT/ch-p4/lib -L$MPI_ROOT/ch-p4/lib64"
+        CFLAGS="$CFLAGS -I$MPI_ROOT/include"
+       enable_mpitests=yes
+       ]
+)
+AC_SUBST(MPI_ROOT)
+
+if test x$enable_mpitests != xno; then
+       AC_MSG_CHECKING([whether to mpitests can be built])
+        AC_CHECK_FILE([$MPI_ROOT/include/mpi.h],
+                      [AC_CHECK_LIB([mpich],[MPI_Start],[enable_mpitests=yes],[enable_mpitests=no])],
+                      [enable_mpitests=no])
+fi
  AC_MSG_RESULT([$enable_mpitests])
  
+
  AC_MSG_NOTICE([Enabling Lustre configure options for libsysio])
  ac_configure_args="$ac_configure_args --with-lustre-hack --with-sockets"
  
@@ -1443,26 +1614,53 @@ if test x$enable_adaptive_timeouts == xyes; then
  fi
  ])
  
+# config delayed recovery
+AC_DEFUN([LC_CONFIG_DELAYED_RECOVERY],
+[AC_MSG_CHECKING([whether to enable delayed recovery support])
+AC_ARG_ENABLE([delayed-recovery],
+       AC_HELP_STRING([--enable-delayed-recovery],
+                       [enable late recovery after main one]),
+       [],[enable_delayed_recovery='no'])
+AC_MSG_RESULT([$enable_delayed_recovery])
+if test x$enable_delayed_recovery == xyes; then
+   AC_DEFINE(HAVE_DELAYED_RECOVERY, 1, [Enable delayed recovery support])
+fi
+])
+
  #
  # LC_CONFIG_QUOTA
  #
-# whether to enable quota support
+# whether to enable quota support global control
  #
  AC_DEFUN([LC_CONFIG_QUOTA],
-[AC_MSG_CHECKING([whether to enable quota support])
-AC_ARG_ENABLE([quota], 
+[AC_ARG_ENABLE([quota],
         AC_HELP_STRING([--enable-quota],
                         [enable quota support]),
         [],[enable_quota='yes'])
-AC_MSG_RESULT([$enable_quota])
-if test x$linux25 != xyes; then
-   enable_quota='no'
-fi
-if test x$enable_quota != xno; then
-   AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support])
+])
+
+# whether to enable quota support(kernel modules)
+AC_DEFUN([LC_QUOTA_MODULE],
+[if test x$enable_quota != xno; then
+    LB_LINUX_CONFIG([QUOTA],[
+       enable_quota_module='yes'
+       AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support])
+    ],[
+       enable_quota_module='no'
+       AC_MSG_WARN([quota is not enabled because the kernel - lacks quota support])
+    ])
  fi
  ])
-  
+
+AC_DEFUN([LC_QUOTA],
+[#check global
+LC_CONFIG_QUOTA
+#check for utils
+AC_CHECK_HEADER(sys/quota.h,
+                [AC_DEFINE(HAVE_SYS_QUOTA_H, 1, [Define to 1 if you have <sys/quota.h>.])],
+                [AC_MSG_ERROR([don't find <sys/quota.h> in your system])])
+])
+
  AC_DEFUN([LC_QUOTA_READ],
  [AC_MSG_CHECKING([if kernel supports quota_read])
  LB_LINUX_TRY_COMPILE([
@@ -1557,6 +1755,53 @@ LB_LINUX_TRY_COMPILE([
  ])
  ])
  
+# LC_SECURITY_PLUG  # for SLES10 SP2
+# check security plug in sles10 sp2 kernel 
+AC_DEFUN([LC_SECURITY_PLUG],
+[AC_MSG_CHECKING([If kernel has security plug support])
+LB_LINUX_TRY_COMPILE([
+        #include <linux/fs.h>
+],[
+        struct dentry   *dentry;
+        struct vfsmount *mnt;
+        struct iattr    *iattr;
+
+        notify_change(dentry, mnt, iattr);
+],[
+        AC_MSG_RESULT(yes)
+        AC_DEFINE(HAVE_SECURITY_PLUG, 1,
+                [SLES10 SP2 use extra parameter in vfs])
+],[
+        AC_MSG_RESULT(no)
+])
+])
+
+AC_DEFUN([LC_PERCPU_COUNTER],
+[AC_MSG_CHECKING([if have struct percpu_counter defined])
+LB_LINUX_TRY_COMPILE([
+        #include <linux/percpu_counter.h>
+],[],[
+        AC_DEFINE(HAVE_PERCPU_COUNTER, 1, [percpu_counter found])
+        AC_MSG_RESULT([yes])
+
+        AC_MSG_CHECKING([if percpu_counter_inc takes the 2nd argument])
+        LB_LINUX_TRY_COMPILE([
+                #include <linux/percpu_counter.h>
+        ],[
+                struct percpu_counter c;
+                percpu_counter_init(&c, 0);
+        ],[
+                AC_DEFINE(HAVE_PERCPU_2ND_ARG, 1, [percpu_counter_init has two
+                                                   arguments])
+                AC_MSG_RESULT([yes])
+        ],[
+                AC_MSG_RESULT([no])
+        ])
+],[
+        AC_MSG_RESULT([no])
+])
+])
+
  #
  # LC_CONFIGURE
  #
@@ -1568,10 +1813,6 @@ AC_DEFUN([LC_CONFIGURE],
  # include/liblustre.h
  AC_CHECK_HEADERS([asm/page.h sys/user.h sys/vfs.h stdint.h blkid/blkid.h])
  
-# include/lustre/lustre_user.h
-# See note there re: __ASM_X86_64_PROCESSOR_H
-AC_CHECK_HEADERS([linux/fs.h linux/quota.h])
-
  # liblustre/llite_lib.h
  AC_CHECK_HEADERS([xtio.h file.h])
  
@@ -1642,7 +1883,7 @@ AM_CONDITIONAL(LIBLUSTRE_TESTS, test x$enable_liblustre_tests = xyes)
  AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
  AM_CONDITIONAL(CLIENT, test x$enable_client = xyes)
  AM_CONDITIONAL(SERVER, test x$enable_server = xyes)
-AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes)
+AM_CONDITIONAL(QUOTA, test x$enable_quota_module = xyes)
  AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes)
  AM_CONDITIONAL(EXT2FS_DEVEL, test x$ac_cv_header_ext2fs_ext2fs_h = xyes)
  AM_CONDITIONAL(LIBPTHREAD, test x$enable_libpthread = xyes)
diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac

index dfa6092..c5ca2b7 100644 (file)
--- a/lustre/autoconf/lustre-version.ac
+++ b/lustre/autoconf/lustre-version.ac
@@ -1,9 +1,15 @@
  m4_define([LUSTRE_MAJOR],[1])
  m4_define([LUSTRE_MINOR],[7])
-m4_define([LUSTRE_PATCH],[90])
+m4_define([LUSTRE_PATCH],[150])
  m4_define([LUSTRE_FIX],[0])
  # Note: we're starting prerelease versions at 50 this time.
  
+dnl # don't forget to update the service tags info
+m4_define([CLIENT_URN],["LUSTRE-180-CLT"])
+m4_define([MDS_URN],["LUSTRE-180-MDS"])
+m4_define([MGS_URN],["LUSTRE-180-MGS"])
+m4_define([OSS_URN],["LUSTRE-180-OSS"])
+
  dnl # liblustre delta is 0.0.1.32 , next version with fixes is ok, but
  dnl # after following release candidate/beta would spill this warning already.
  m4_define([LUSTRE_VER_ALLOWED_OFFSET],["OBD_OCD_VERSION(0,0,1,32)"])
@@ -25,6 +31,10 @@ m4_define([LUSTRE_VERSION],m4_if(LUSTRE_FIX,[0],LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE
  [AC_LUSTRE_VER_ALLOWED_OFFSET]=LUSTRE_VER_ALLOWED_OFFSET
  [AC_LUSTRE_LIB_VER_OFFSET_WARN]=LUSTRE_LIB_VER_OFFSET_WARN
  [AC_LUSTRE_CLI_VER_OFFSET_WARN]=LUSTRE_CLI_VER_OFFSET_WARN
+[AC_LUSTRE_CLIENT_URN]=CLIENT_URN
+[AC_LUSTRE_MGS_URN]=MGS_URN
+[AC_LUSTRE_MDS_URN]=MDS_URN
+[AC_LUSTRE_OSS_URN]=OSS_URN
  
  AC_SUBST([AC_LUSTRE_MAJOR])
  AC_SUBST([AC_LUSTRE_MINOR])
@@ -34,3 +44,7 @@ AC_SUBST([AC_LUSTRE_VERSION_STRING])
  AC_SUBST([AC_LUSTRE_VER_ALLOWED_OFFSET])
  AC_SUBST([AC_LUSTRE_LIB_VER_OFFSET_WARN])
  AC_SUBST([AC_LUSTRE_CLI_VER_OFFSET_WARN])
+AC_SUBST([AC_LUSTRE_CLIENT_URN])
+AC_SUBST([AC_LUSTRE_MDS_URN])
+AC_SUBST([AC_LUSTRE_MGS_URN])
+AC_SUBST([AC_LUSTRE_OSS_URN])
diff --git a/lustre/conf/Makefile.am b/lustre/conf/Makefile.am

index 978cf29..ad285b6 100644 (file)
--- a/lustre/conf/Makefile.am
+++ b/lustre/conf/Makefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  EXTRA_DIST = lustre.dtd lustre.schema slapd-lustre.conf lustre2ldif.xsl top.ldif
  ldapconfdir = $(sysconfdir)/openldap
diff --git a/lustre/contrib/README b/lustre/contrib/README

index 0741258..92555a7 100644 (file)
--- a/lustre/contrib/README
+++ b/lustre/contrib/README
@@ -1,6 +1,8 @@
  The files in this directory are user-contributed and are not supported by
-CFS in any way.
+Sun Microsystems, Inc. in any way.
  
+. adio_driver_mpich2-1.0.7.patch: an optimized Lustre ADIO driver for MPICH2-1.0.7,
+  developed by Sun Lustre group and ORNL together.
  . mpich2-1.0.3.patch & adio-lustre-mpich2-v03.patch : came from weikuan ORNL
    (wyu@ornl.gov) and you can get detail information http://ft.ornl.gov/projects/io/
     
diff --git a/lustre/contrib/adio_driver_mpich2-1.0.7.patch b/lustre/contrib/adio_driver_mpich2-1.0.7.patch

new file mode 100644 (file)

index 0000000..6b33872
--- /dev/null
+++ b/lustre/contrib/adio_driver_mpich2-1.0.7.patch
@@ -0,0 +1,2588 @@
+diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+--- ad_lustre_orig/ad_lustre_aggregate.c       1970-01-01 08:00:00.000000000 +0800
++++ ad_lustre/ad_lustre_aggregate.c    2008-10-17 17:30:00.000000000 +0800
+@@ -0,0 +1,502 @@
++/* -*- Mode: C; c-basic-offset:4 ; -*- */
++/*
++ *   Copyright (C) 1997 University of Chicago.
++ *   See COPYRIGHT notice in top-level directory.
++ *
++ *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ */
++
++#include "ad_lustre.h"
++#include "adio_extern.h"
++
++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
++                                  int mode)
++{
++    int *striping_info = NULL;
++    /* get striping information:
++     *  striping_info[0]: stripe_size
++     *  striping_info[1]: stripe_count
++     *  striping_info[2]: avail_cb_nodes
++     */
++    int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag;
++    int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes;
++    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
++
++    /* Get hints value */
++    /* stripe size */
++    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
++    if (lflag)
++      stripe_size = atoi(value);
++    /* stripe count */
++    /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
++    MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
++    if (lflag)
++      stripe_count = atoi(value);
++
++    /* Calculate the available number of I/O clients, that is
++     *  avail_cb_nodes=min(cb_nodes, stripe_count*CO), where
++     *  CO=1 by default
++     */
++    if (!mode) {
++        /* for collective read,
++       * if "CO" clients access the same OST simultaneously,
++       * the OST disk seek time would be much. So, to avoid this,
++       * it might be better if 1 client only accesses 1 OST.
++       * So, we set CO = 1 to meet the above requirement.
++       */
++      CO = 1;
++      /*XXX: maybe there are other better way for collective read */
++    } else {
++        /* CO_max: the largest number of IO clients for each ost group */
++        CO_max = (nprocs_for_coll - 1)/ stripe_count + 1;
++        /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
++      MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
++      if (lflag)
++          CO = atoi(value);
++      CO = ADIOI_MIN(CO_max, CO);
++    }
++    /* Calculate how many IO clients we need */
++    /* To avoid extent lock conflicts,
++     * avail_cb_nodes should divide (stripe_count*CO) exactly,
++     * so that each OST is accessed by only one or more constant clients. */
++    avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO);
++    if (avail_cb_nodes == nprocs_for_coll) {
++        CO_nodes = stripe_count * CO;
++        do {
++            /* find the divisor of CO_nodes */
++            divisor = 1;
++            do {
++                divisor ++;
++            } while (CO_nodes % divisor);
++            CO_nodes = CO_nodes / divisor;
++            /* if stripe_count*CO is a prime number, change nothing */
++            if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) {
++                avail_cb_nodes = CO_nodes;
++                break;
++            }
++        } while (CO_nodes != 1);
++    }
++
++    *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
++    striping_info = *striping_info_ptr;
++    striping_info[0] = stripe_size;
++    striping_info[1] = stripe_count;
++    striping_info[2] = avail_cb_nodes;
++
++    ADIOI_Free(value);
++}
++
++int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
++                                 ADIO_Offset *len, int *striping_info)
++{
++    int rank_index, rank;
++    ADIO_Offset avail_bytes;
++    int stripe_size = striping_info[0];
++    int avail_cb_nodes = striping_info[2];
++
++    /* Produce the stripe-contiguous pattern for Lustre */
++    rank_index = (int)((off / stripe_size) % avail_cb_nodes);
++
++    avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
++                  (ADIO_Offset)stripe_size - off;
++    if (avail_bytes < *len) {
++      /* this proc only has part of the requested contig. region */
++      *len = avail_bytes;
++    }
++    /* map our index to a rank */
++    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
++    rank = fd->hints->ranklist[rank_index];
++
++    return rank;
++}
++
++void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
++                            int *len_list, int contig_access_count,
++                            int *striping_info, int nprocs,
++                              int *count_my_req_procs_ptr,
++                            int **count_my_req_per_proc_ptr,
++                            ADIOI_Access ** my_req_ptr,
++                            int **buf_idx_ptr)
++{
++    /* Nothing different from ADIOI_Calc_my_req(), except calling
++     * ADIOI_Lustre_Calc_aggregator() instead of the old one */
++    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
++    int i, l, proc;
++    ADIO_Offset avail_len, rem_len, curr_idx, off;
++    ADIOI_Access *my_req;
++
++    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
++    count_my_req_per_proc = *count_my_req_per_proc_ptr;
++
++    /* buf_idx is relevant only if buftype_is_contig.
++     * buf_idx[i] gives the index into user_buf where data received
++     * from proc. i should be placed. This allows receives to be done
++     * without extra buffer. This can't be done if buftype is not contig.
++     */
++    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    /* initialize buf_idx to -1 */
++    for (i = 0; i < nprocs; i++)
++      buf_idx[i] = -1;
++
++    /* one pass just to calculate how much space to allocate for my_req;
++     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
++     */
++    for (i = 0; i < contig_access_count; i++) {
++      /* short circuit offset/len processing if len == 0
++       * (zero-byte  read/write
++       */
++      if (len_list[i] == 0)
++          continue;
++      off = offset_list[i];
++      avail_len = len_list[i];
++      /* we set avail_len to be the total size of the access.
++       * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
++       * the amount that was available.
++       */
++      proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
++      count_my_req_per_proc[proc]++;
++      /* figure out how many data is remaining in the access
++       * we'll take care of this data (if there is any)
++       * in the while loop below.
++       */
++      rem_len = len_list[i] - avail_len;
++
++      while (rem_len != 0) {
++          off += avail_len;   /* point to first remaining byte */
++          avail_len = rem_len;        /* save remaining size, pass to calc */
++          proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
++          count_my_req_per_proc[proc]++;
++          rem_len -= avail_len;       /* reduce remaining length by amount from fd */
++      }
++    }
++
++    *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
++    my_req = *my_req_ptr;
++
++    count_my_req_procs = 0;
++    for (i = 0; i < nprocs; i++) {
++      if (count_my_req_per_proc[i]) {
++          my_req[i].offsets = (ADIO_Offset *)
++                              ADIOI_Malloc(count_my_req_per_proc[i] *
++                                             sizeof(ADIO_Offset));
++          my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
++                                                sizeof(int));
++          count_my_req_procs++;
++      }
++      my_req[i].count = 0;    /* will be incremented where needed later */
++    }
++
++    /* now fill in my_req */
++    curr_idx = 0;
++    for (i = 0; i < contig_access_count; i++) {
++      if (len_list[i] == 0)
++          continue;
++      off = offset_list[i];
++      avail_len = len_list[i];
++      proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
++
++      /* for each separate contiguous access from this process */
++      if (buf_idx[proc] == -1)
++          buf_idx[proc] = (int) curr_idx;
++
++      l = my_req[proc].count;
++      curr_idx += (int) avail_len;    /* NOTE: Why is curr_idx an int?  Fix? */
++
++      rem_len = len_list[i] - avail_len;
++
++      /* store the proc, offset, and len information in an array
++       * of structures, my_req. Each structure contains the
++       * offsets and lengths located in that process's FD,
++       * and the associated count.
++       */
++      my_req[proc].offsets[l] = off;
++      my_req[proc].lens[l] = (int) avail_len;
++      my_req[proc].count++;
++
++      while (rem_len != 0) {
++          off += avail_len;
++          avail_len = rem_len;
++          proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
++                                                striping_info);
++          if (buf_idx[proc] == -1)
++              buf_idx[proc] = (int) curr_idx;
++
++          l = my_req[proc].count;
++          curr_idx += avail_len;
++          rem_len -= avail_len;
++
++          my_req[proc].offsets[l] = off;
++          my_req[proc].lens[l] = (int) avail_len;
++          my_req[proc].count++;
++      }
++    }
++
++#ifdef AGG_DEBUG
++    for (i = 0; i < nprocs; i++) {
++      if (count_my_req_per_proc[i] > 0) {
++          FPRINTF(stdout, "data needed from %d (count = %d):\n",
++                          i, my_req[i].count);
++          for (l = 0; l < my_req[i].count; l++) {
++              FPRINTF(stdout, "   off[%d] = %lld, len[%d] = %d\n",
++                              l, my_req[i].offsets[l], l, my_req[i].lens[l]);
++          }
++      }
++    }
++#endif
++#if 0
++    for (i = 0; i < nprocs; i++) {
++      FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
++    }
++#endif
++
++    *count_my_req_procs_ptr = count_my_req_procs;
++    *buf_idx_ptr = buf_idx;
++}
++
++int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
++                         int *len_list, int nprocs)
++{
++    /* If the processes are non-interleaved, we will check the req_size.
++     *   if (avg_req_size > big_req_size) {
++     *       docollect = 0;
++     *   }
++     */
++
++    int i, docollect = 1, lflag, big_req_size = 0;
++    ADIO_Offset req_size = 0, total_req_size;
++    int avg_req_size, total_access_count;
++    char *value = NULL;
++
++    /* calculate total_req_size and total_access_count */
++    for (i = 0; i < contig_access_count; i++)
++        req_size += len_list[i];
++    MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
++               fd->comm);
++    MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
++               fd->comm);
++    /* estimate average req_size */
++    avg_req_size = (int)(total_req_size / total_access_count);
++
++    /* get hint of big_req_size */
++    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
++    MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
++    if (lflag)
++        big_req_size = atoi(value);
++    /* Don't perform collective I/O if there are big requests */
++    if ((big_req_size > 0) && (avg_req_size > big_req_size))
++        docollect = 0;
++
++    ADIOI_Free(value);
++
++    return docollect;
++}
++
++void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
++                                int *count_my_req_per_proc,
++                                ADIOI_Access * my_req,
++                                int nprocs, int myrank,
++                                  ADIO_Offset start_offset,
++                                  ADIO_Offset end_offset,
++                                  int *striping_info,
++                                int *count_others_req_procs_ptr,
++                                ADIOI_Access ** others_req_ptr)
++{
++    /* what requests of other processes will be written by this process */
++
++    int *count_others_req_per_proc, count_others_req_procs, proc;
++    int i, j, lflag, samesize = 0, contiguous = 0;
++    int avail_cb_nodes = striping_info[2];
++    MPI_Request *send_requests, *recv_requests;
++    MPI_Status *statuses;
++    ADIOI_Access *others_req;
++    char *value = NULL;
++    ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
++
++    /* There are two hints, which could reduce some MPI communication overhead,
++     * if the users knows the I/O pattern and set them correctly. */
++    /* They are
++     * contiguous_data: if the data are contiguous,
++     *                  we don't need to do MPI_Alltoall().
++     * same_io_size: And if the data req size is same,
++     *               we can calculate the offset directly
++     */
++    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
++    /* hint of contiguous data */
++    MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
++    if (lflag && !strcmp(value, "yes"))
++        contiguous = 1;
++    /* hint of same io size */
++    MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
++    if (lflag && !strcmp(value, "yes"))
++        samesize = 1;
++    ADIOI_Free(value);
++
++    *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
++                                                    sizeof(ADIOI_Access));
++    others_req = *others_req_ptr;
++
++    /* if the data are contiguous, we can calulate the offset and length
++     * of the other requests simply, instead of MPI_Alltoall() */
++    if (contiguous) {
++        for (i = 0; i < nprocs; i++) {
++            others_req[i].count = 0;
++        }
++        req_len = end_offset - start_offset + 1;
++        all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
++
++        /* same req size ? */
++        if (samesize == 0) {
++            /* calculate the min_st_offset */
++            MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
++                          MPI_MIN, fd->comm);
++            /* exchange request length */
++            MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
++                          fd->comm);
++        } else { /* same request size */
++            /* calculate the 1st request's offset */
++            min_st_offset = start_offset - myrank * req_len;
++            /* assign request length to all_lens[] */
++            for (i = 0; i < nprocs; i ++)
++               all_lens[i] = req_len;
++        }
++        if (myrank < avail_cb_nodes) {
++            /* This is a IO client and it will receive data from others */
++            off = min_st_offset;
++            /* calcaulte other_req[i].count */
++            for (i = 0; i < nprocs; i++) {
++                avail_len = all_lens[i];
++                rem_len = avail_len;
++                while (rem_len > 0) {
++                  proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
++                                                        striping_info);
++                    if (proc == myrank) {
++                        others_req[i].count ++;
++                    }
++                    off += avail_len;
++                    rem_len -= avail_len;
++                    avail_len = rem_len;
++                }
++            }
++            /* calculate offset and len for each request */
++            off = min_st_offset;
++            for (i = 0; i < nprocs; i++) {
++                if (others_req[i].count) {
++                  others_req[i].offsets = (ADIO_Offset *)
++                                            ADIOI_Malloc(others_req[i].count *
++                                                       sizeof(ADIO_Offset));
++                  others_req[i].lens = (int *)
++                                         ADIOI_Malloc(others_req[i].count *
++                                                      sizeof(int));
++                    others_req[i].mem_ptrs = (MPI_Aint *)
++                                             ADIOI_Malloc(others_req[i].count *
++                                                        sizeof(MPI_Aint));
++                }
++                j = 0;
++                avail_len = all_lens[i];
++                rem_len = avail_len;
++                while (rem_len > 0) {
++                  proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
++                                                        striping_info);
++                    if (proc == myrank) {
++                        others_req[i].offsets[j] = off;
++                        others_req[i].lens[j] = (int)avail_len;
++                        j ++;
++                    }
++                    off += avail_len;
++                    rem_len -= avail_len;
++                    avail_len = rem_len;
++                }
++            }
++        }
++        ADIOI_Free(all_lens);
++    } else {
++        /* multiple non-contiguous requests */
++        /* first find out how much to send/recv and from/to whom */
++
++        /*
++         * count_others_req_procs:
++         *    number of processes whose requests will be written by
++         *    this process (including this process itself)
++         * count_others_req_per_proc[i]:
++         *    how many separate contiguous requests of proc[i] will be
++         *    written by this process.
++         */
++
++        count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++
++        MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
++                   count_others_req_per_proc, 1, MPI_INT, fd->comm);
++
++        count_others_req_procs = 0;
++        for (i = 0; i < nprocs; i++) {
++          if (count_others_req_per_proc[i]) {
++              others_req[i].count = count_others_req_per_proc[i];
++              others_req[i].offsets = (ADIO_Offset *)
++                                        ADIOI_Malloc(others_req[i].count *
++                                               sizeof(ADIO_Offset));
++              others_req[i].lens = (int *)
++                                   ADIOI_Malloc(others_req[i].count *
++                                                  sizeof(int));
++              others_req[i].mem_ptrs = (MPI_Aint *)
++                                       ADIOI_Malloc(others_req[i].count *
++                                                    sizeof(MPI_Aint));
++              count_others_req_procs++;
++          } else
++              others_req[i].count = 0;
++        }
++
++        /* now send the calculated offsets and lengths to respective processes */
++
++        send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
++                                                     sizeof(MPI_Request));
++        recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
++                                                   sizeof(MPI_Request));
++        /* +1 to avoid a 0-size malloc */
++
++        j = 0;
++        for (i = 0; i < nprocs; i++) {
++          if (others_req[i].count) {
++              MPI_Irecv(others_req[i].offsets, others_req[i].count,
++                        ADIO_OFFSET, i, i + myrank, fd->comm,
++                        &recv_requests[j]);
++              j++;
++              MPI_Irecv(others_req[i].lens, others_req[i].count,
++                        MPI_INT, i, i + myrank + 1, fd->comm,
++                        &recv_requests[j]);
++              j++;
++          }
++        }
++
++        j = 0;
++        for (i = 0; i < nprocs; i++) {
++          if (my_req[i].count) {
++              MPI_Isend(my_req[i].offsets, my_req[i].count,
++                        ADIO_OFFSET, i, i + myrank, fd->comm,
++                        &send_requests[j]);
++              j++;
++              MPI_Isend(my_req[i].lens, my_req[i].count,
++                        MPI_INT, i, i + myrank + 1, fd->comm,
++                        &send_requests[j]);
++              j++;
++          }
++        }
++
++        statuses = (MPI_Status *)
++                   ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
++                                                 count_others_req_procs)) *
++                                         sizeof(MPI_Status));
++        /* +1 to avoid a 0-size malloc */
++
++        MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
++        MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
++
++        ADIOI_Free(send_requests);
++        ADIOI_Free(recv_requests);
++        ADIOI_Free(statuses);
++        ADIOI_Free(count_others_req_per_proc);
++
++        *count_others_req_procs_ptr = count_others_req_procs;
++    }
++}
+diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
+--- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
++++ ad_lustre/ad_lustre.c      2008-10-17 17:03:42.000000000 +0800
+@@ -1,9 +1,11 @@
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+-/* 
+- *   Copyright (C) 2001 University of Chicago. 
++/*
++ *   Copyright (C) 2001 University of Chicago.
+  *   See COPYRIGHT notice in top-level directory.
+  *
+  *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
+  */
+ 
+ #include "ad_lustre.h"
+@@ -13,12 +15,12 @@
+     ADIOI_LUSTRE_ReadContig, /* ReadContig */
+     ADIOI_LUSTRE_WriteContig, /* WriteContig */
+     ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
+-    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
++    ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
+     ADIOI_GEN_SeekIndividual, /* SeekIndividual */
+     ADIOI_GEN_Fcntl, /* Fcntl */
+     ADIOI_LUSTRE_SetInfo, /* SetInfo */
+     ADIOI_GEN_ReadStrided, /* ReadStrided */
+-    ADIOI_GEN_WriteStrided, /* WriteStrided */
++    ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
+     ADIOI_GEN_Close, /* Close */
+ #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
+     ADIOI_GEN_IreadContig, /* IreadContig */
+diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
+--- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
++++ ad_lustre/ad_lustre.h      2008-10-17 17:11:11.000000000 +0800
+@@ -1,9 +1,11 @@
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+-/* 
+- *   Copyright (C) 1997 University of Chicago. 
++/*
++ *   Copyright (C) 1997 University of Chicago.
+  *   See COPYRIGHT notice in top-level directory.
+  *
+  *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
+  */
+ 
+ #ifndef AD_UNIX_INCLUDE
+@@ -24,7 +26,32 @@
+ 
+ /*#include <fcntl.h>*/
+ #include <sys/ioctl.h>
++#ifdef WITH_LUSTRE
+ #include "lustre/lustre_user.h"
++#else
++/* copy something from lustre_user.h here */
++#  define LOV_USER_MAGIC 0x0BD10BD0
++#  define LL_IOC_LOV_SETSTRIPE  _IOW ('f', 154, long)
++#  define LL_IOC_LOV_GETSTRIPE  _IOW ('f', 155, long)
++#  define lov_user_ost_data lov_user_ost_data_v1
++struct lov_user_ost_data_v1 {     /* per-stripe data structure */
++        __u64 l_object_id;        /* OST object ID */
++        __u64 l_object_gr;        /* OST object group (creating MDS number) */
++        __u32 l_ost_gen;          /* generation of this OST index */
++        __u32 l_ost_idx;          /* OST index in LOV */
++} __attribute__((packed));
++#define lov_user_md lov_user_md_v1
++struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
++        __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V1 */
++        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
++        __u64 lmm_object_id;      /* LOV object ID */
++        __u64 lmm_object_gr;      /* LOV object group */
++        __u32 lmm_stripe_size;    /* size of stripe in bytes */
++        __u16 lmm_stripe_count;   /* num stripes in use for this object */
++        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
++        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
++} __attribute__((packed));
++#endif
+ #include "adio.h"
+ /*#include "adioi.h"*/
+ 
+@@ -41,24 +68,31 @@
+ 
+ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
+ void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
+-void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, 
+-                      MPI_Datatype datatype, int file_ptr_type,
+-                     ADIO_Offset offset, ADIO_Status *status, int
+-                   *error_code);
+-void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, 
+-                      MPI_Datatype datatype, int file_ptr_type,
+-                      ADIO_Offset offset, ADIO_Status *status, int
+-                    *error_code);   
++void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
++                             MPI_Datatype datatype, int file_ptr_type,
++                             ADIO_Offset offset, ADIO_Status *status,
++                             int *error_code);
++void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
++                              MPI_Datatype datatype, int file_ptr_type,
++                              ADIO_Offset offset, ADIO_Status *status,
++                              int *error_code);
++void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
++                             MPI_Datatype datatype, int file_ptr_type,
++                             ADIO_Offset offset, ADIO_Status *status,
++                             int *error_code);
+ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
+-                     MPI_Datatype datatype, int file_ptr_type,
+-                     ADIO_Offset offset, ADIO_Status *status, int
+-                     *error_code);
++                                 MPI_Datatype datatype, int file_ptr_type,
++                                 ADIO_Offset offset, ADIO_Status *status,
++                                   int *error_code);
+ void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
+-                     MPI_Datatype datatype, int file_ptr_type,
+-                     ADIO_Offset offset, ADIO_Status *status, int
+-                     *error_code);
++                                MPI_Datatype datatype, int file_ptr_type,
++                                ADIO_Offset offset, ADIO_Status *status,
++                                  int *error_code);
++void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
++                            MPI_Datatype datatype, int file_ptr_type,
++                            ADIO_Offset offset, ADIO_Status *status,
++                              int *error_code);
+ void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
+                      int *error_code);
+ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
+-
+ #endif /* End of AD_UNIX_INCLUDE */
+diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
+--- ad_lustre_orig/ad_lustre_hints.c   2008-09-17 14:36:57.000000000 +0800
++++ ad_lustre/ad_lustre_hints.c        2008-10-20 14:36:48.000000000 +0800
+@@ -1,9 +1,11 @@
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+-/* 
+- *   Copyright (C) 1997 University of Chicago. 
++/*
++ *   Copyright (C) 1997 University of Chicago.
+  *   See COPYRIGHT notice in top-level directory.
+  *
+  *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
+  */
+ 
+ #include "ad_lustre.h"
+@@ -11,130 +13,173 @@
+ 
+ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
+ {
+-    char *value, *value_in_fd;
+-    int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
+-    struct lov_user_md lum = { 0 };
+-    int err, myrank, fd_sys, perm, amode, old_mask;
++    char *value = NULL;
++    int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
++    static char myname[] = "ADIOI_LUSTRE_SETINFO";
+ 
++    *error_code = MPI_SUCCESS;
+     value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
++
+     if ( (fd->info) == MPI_INFO_NULL) {
+-      /* This must be part of the open call. can set striping parameters 
+-           if necessary. */ 
++      /* This must be part of the open call. can set striping parameters
++           if necessary. */
+       MPI_Info_create(&(fd->info));
+ 
+       MPI_Info_set(fd->info, "direct_read", "false");
+       MPI_Info_set(fd->info, "direct_write", "false");
+       fd->direct_read = fd->direct_write = 0;
+-      
+-      /* has user specified striping or server buffering parameters 
++
++      /* has user specified striping or server buffering parameters
+            and do they have the same value on all processes? */
+       if (users_info != MPI_INFO_NULL) {
+-          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, 
+-                       value, &flag);
+-          if (flag) 
+-              str_unit=atoi(value);
+-
+-          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, 
+-                       value, &flag);
+-          if (flag) 
+-              str_factor=atoi(value);
+-
+-          MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, 
++            /* direct read and write */
++          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
+                        value, &flag);
+-          if (flag) 
+-              start_iodev=atoi(value);
+-
+-          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, 
+-                           value, &flag);
+           if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
+               MPI_Info_set(fd->info, "direct_read", "true");
+               fd->direct_read = 1;
+           }
+-
+-          MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, 
++          MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
+                            value, &flag);
+           if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
+               MPI_Info_set(fd->info, "direct_write", "true");
+               fd->direct_write = 1;
+           }
++            /*  stripe size */
++          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
++                       value, &flag);
++          if (flag && (str_unit = atoi(value))) {
++              tmp_val = str_unit;
++              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++              if (tmp_val != str_unit) {
++                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                     "striping_unit",
++                                                     error_code);
++                    ADIOI_Free(value);
++                  return;
++              }
++              MPI_Info_set(fd->info, "striping_unit", value);
++          }
++            /* stripe count */
++          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
++                       value, &flag);
++          if (flag && (str_factor = atoi(value))) {
++              tmp_val = str_factor;
++              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++              if (tmp_val != str_factor) {
++                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                     "striping_factor",
++                                                     error_code);
++                    ADIOI_Free(value);
++                  return;
++              }
++              MPI_Info_set(fd->info, "striping_factor", value);
++          }
++            /* stripe offset */
++            MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
++                       value, &flag);
++          if (flag && ((start_iodev = atoi(value)) >= 0)) {
++              tmp_val = start_iodev;
++              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++              if (tmp_val != start_iodev) {
++                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                     "start_iodevice",
++                                                     error_code);
++                    ADIOI_Free(value);
++                  return;
++              }
++              MPI_Info_set(fd->info, "start_iodevice", value);
++          }
+       }
+-
+-      MPI_Comm_rank(fd->comm, &myrank);
+-      if (myrank == 0) {
+-          tmp_val[0] = str_factor;
+-          tmp_val[1] = str_unit;
+-          tmp_val[2] = start_iodev;
++    }
++    if (users_info != MPI_INFO_NULL) {
++        /* CO: IO Clients/OST,
++         * to keep the load balancing between clients and OSTs */
++        MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
++                     &flag);
++      if (flag && (int_val = atoi(value)) > 0) {
++            tmp_val = int_val;
++          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++          if (tmp_val != int_val) {
++                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                   "CO",
++                                                   error_code);
++                ADIOI_Free(value);
++              return;
++          }
++          MPI_Info_set(fd->info, "CO", value);
+       }
+-      MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
+-
+-      if (tmp_val[0] != str_factor 
+-              || tmp_val[1] != str_unit 
+-              || tmp_val[2] != start_iodev) {
+-          FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
+-                  "-striping_factor:striping_unit:start_iodevice "
+-                  "need to be identical across all processes\n");
+-          MPI_Abort(MPI_COMM_WORLD, 1);
+-              } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
+-           /* if user has specified striping info, process 0 tries to set it */
+-          if (!myrank) {
+-              if (fd->perm == ADIO_PERM_NULL) {
+-                  old_mask = umask(022);
+-                  umask(old_mask);
+-                  perm = old_mask ^ 0666;
+-              }
+-              else perm = fd->perm;
+-
+-              amode = 0;
+-              if (fd->access_mode & ADIO_CREATE)
+-                  amode = amode | O_CREAT;
+-              if (fd->access_mode & ADIO_RDONLY)
+-                  amode = amode | O_RDONLY;
+-              if (fd->access_mode & ADIO_WRONLY)
+-                  amode = amode | O_WRONLY;
+-              if (fd->access_mode & ADIO_RDWR)
+-                  amode = amode | O_RDWR;
+-              if (fd->access_mode & ADIO_EXCL)
+-                  amode = amode | O_EXCL;
+-
+-              /* we need to create file so ensure this is set */
+-              amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
+-
+-              fd_sys = open(fd->filename, amode, perm);
+-              if (fd_sys == -1) { 
+-                  if (errno != EEXIST) 
+-                      fprintf(stderr, 
+-                              "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
+-              } else {
+-                  lum.lmm_magic = LOV_USER_MAGIC;
+-                  lum.lmm_pattern = 0;
+-                  lum.lmm_stripe_size = str_unit;
+-                  lum.lmm_stripe_count = str_factor;
+-                  lum.lmm_stripe_offset = start_iodev;
+-
+-                  err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
+-                  if (err == -1 && errno != EEXIST) { 
+-                      fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
+-                  }
+-                  close(fd_sys);
+-             }
+-          } /* End of striping parameters validation */
++        /* big_req_size:
++         * if the req size is bigger than this,
++         * collective IO may not be performed.
++         */
++      MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
++                     &flag);
++      if (flag && (int_val = atoi(value)) > 0) {
++            tmp_val = int_val;
++          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++          if (tmp_val != int_val) {
++              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                 "big_req_size",
++                                                 error_code);
++                ADIOI_Free(value);
++              return;
++          }
++          MPI_Info_set(fd->info, "big_req_size", value);
++        }
++        /* ds_in_coll: disable data sieving in collective IO */
++      MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
++                   value, &flag);
++      if (flag && (!strcmp(value, "enable") ||
++                     !strcmp(value, "ENABLE"))) {
++            tmp_val = int_val = 1;
++          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++          if (tmp_val != int_val) {
++              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                 "ds_in_coll",
++                                                 error_code);
++                ADIOI_Free(value);
++                return;
++          }
++          MPI_Info_set(fd->info, "ds_in_coll", "enable");
++      }
++        /* contiguous_data: whether the data are contiguous */
++      MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
++                   value, &flag);
++        if (flag && (!strcmp(value, "yes") ||
++                     !strcmp(value, "YES"))) {
++            tmp_val = int_val = 1;
++          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++          if (tmp_val != int_val) {
++              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                 "contiguous_data",
++                                                 error_code);
++                ADIOI_Free(value);
++                return;
++          }
++          MPI_Info_set(fd->info, "contiguous_data", "yes");
++      }
++        /* same_io_size: whether the req size is same */
++      MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
++                   value, &flag);
++        if (flag && (!strcmp(value, "yes") ||
++                     !strcmp(value, "YES"))) {
++            tmp_val = int_val = 1;
++          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++          if (tmp_val != int_val) {
++              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                 "same_io_size",
++                                                 error_code);
++                ADIOI_Free(value);
++                return;
++          }
++          MPI_Info_set(fd->info, "same_io_size", "yes");
+       }
+-      
+-      MPI_Barrier(fd->comm);
+-      /* set the values for collective I/O and data sieving parameters */
+-      ADIOI_GEN_SetInfo(fd, users_info, error_code);
+-    } else {
+-      /* The file has been opened previously and fd->fd_sys is a valid
+-           file descriptor. cannot set striping parameters now. */
+-      
+-      /* set the values for collective I/O and data sieving parameters */
+-      ADIOI_GEN_SetInfo(fd, users_info, error_code);
+     }
+- 
+-    if (ADIOI_Direct_read) fd->direct_read = 1;
+-    if (ADIOI_Direct_write) fd->direct_write = 1;
+-
+     ADIOI_Free(value);
++    /* set the values for collective I/O and data sieving parameters */
++    ADIOI_GEN_SetInfo(fd, users_info, error_code);
+ 
+-    *error_code = MPI_SUCCESS;
++    if (ADIOI_Direct_read) fd->direct_read = 1;
++    if (ADIOI_Direct_write) fd->direct_write = 1;
+ }
+diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
+--- ad_lustre_orig/ad_lustre_open.c    2008-09-17 14:36:57.000000000 +0800
++++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800
+@@ -1,18 +1,21 @@
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+-/* 
+- *   Copyright (C) 1997 University of Chicago. 
++/*
++ *   Copyright (C) 1997 University of Chicago.
+  *   See COPYRIGHT notice in top-level directory.
+  *
+  *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
+  */
+ 
+ #include "ad_lustre.h"
+ 
+ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
+ {
+-    int perm, old_mask, amode, amode_direct;
++    int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
++    int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
+     struct lov_user_md lum = { 0 };
+-    char *value;
++    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+ 
+ #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
+     static char myname[] = "ADIOI_LUSTRE_OPEN";
+@@ -22,12 +25,57 @@
+       old_mask = umask(022);
+       umask(old_mask);
+       perm = old_mask ^ 0666;
+-    }
+-    else perm = fd->perm;
++    } else
++      perm = fd->perm;
+ 
+-    amode = 0;
+-    if (fd->access_mode & ADIO_CREATE)
++    if (fd->access_mode & ADIO_CREATE) {
+       amode = amode | O_CREAT;
++        /* Check striping info
++         * if already set by SetInfo(), set them to lum; otherwise, set by lum
++         */
++        MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
++                   &flag);
++        if (flag)
++          stripe_size = atoi(value);
++
++        MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
++                   &flag);
++        if (flag)
++          stripe_count = atoi(value);
++
++        MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
++                   &flag);
++        if (flag)
++          stripe_offset = atoi(value);
++
++        /* if user has specified striping info,
++         * process 0 will try to check and set it.
++         */
++        if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
++          MPI_Comm_rank(fd->comm, &myrank);
++          if (myrank == 0) {
++              int fd_sys = open(fd->filename, amode, perm);
++              if (fd_sys == -1) {
++                  if (errno != EEXIST)
++                      FPRINTF(stderr, "Failure to open file %s %d %d\n",
++                              strerror(errno), amode, perm);
++              } else {
++                  lum.lmm_magic = LOV_USER_MAGIC;
++                  lum.lmm_pattern = 1;
++                  lum.lmm_stripe_size = stripe_size;
++                  lum.lmm_stripe_count = stripe_count;
++                  lum.lmm_stripe_offset = stripe_offset;
++
++                  if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
++                      FPRINTF(stderr,
++                              "Failure to set striping info to Lustre!\n");
++                  close(fd_sys);
++              }
++          }
++          MPI_Barrier(fd->comm);
++        }
++    }
++
+     if (fd->access_mode & ADIO_RDONLY)
+       amode = amode | O_RDONLY;
+     if (fd->access_mode & ADIO_WRONLY)
+@@ -42,32 +90,36 @@
+     fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
+ 
+     if (fd->fd_sys != -1) {
+-        int err;
+-
+-        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
+-
+         /* get file striping information and set it in info */
+-        lum.lmm_magic = LOV_USER_MAGIC;
+-        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
+-
+-        if (!err) {
+-            sprintf(value, "%d", lum.lmm_stripe_size);
+-            MPI_Info_set(fd->info, "striping_unit", value);
+-
+-            sprintf(value, "%d", lum.lmm_stripe_count);
+-            MPI_Info_set(fd->info, "striping_factor", value);
+-
+-            sprintf(value, "%d", lum.lmm_stripe_offset);
+-            MPI_Info_set(fd->info, "start_iodevice", value);
+-        }
+-        ADIOI_Free(value);
++      lum.lmm_magic = LOV_USER_MAGIC;
++      err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
+ 
++      if (!err) {
++          if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
++                (lum.lmm_stripe_offset >= 0)) {
++              sprintf(value, "%d", lum.lmm_stripe_size);
++              MPI_Info_set(fd->info, "striping_unit", value);
++
++              sprintf(value, "%d", lum.lmm_stripe_count);
++              MPI_Info_set(fd->info, "striping_factor", value);
++
++              sprintf(value, "%d", lum.lmm_stripe_offset);
++              MPI_Info_set(fd->info, "start_iodevice", value);
++          } else {
++              FPRINTF(stderr, "Striping info is invalid!\n");
++              ADIOI_Free(value);
++              MPI_Abort(MPI_COMM_WORLD, 1);
++          }
++      } else {
++          FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
++            ADIOI_Free(value);
++          MPI_Abort(MPI_COMM_WORLD, 1);
++      }
+         if (fd->access_mode & ADIO_APPEND)
+             fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
+-    } 
+-
++    }
+     if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
+-      fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
++        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
+ 
+     fd->fd_direct = -1;
+     if (fd->direct_write || fd->direct_read) {
+@@ -81,20 +133,22 @@
+     }
+ 
+     /* --BEGIN ERROR HANDLING-- */
+-    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
+-              (fd->direct_write || fd->direct_read))) {
++    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
++      (fd->direct_write || fd->direct_read))) {
+       if (errno == ENAMETOOLONG)
+           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+-                                             MPIR_ERR_RECOVERABLE, myname,
+-                                             __LINE__, MPI_ERR_BAD_FILE,
++                                             MPIR_ERR_RECOVERABLE,
++                                             myname, __LINE__,
++                                             MPI_ERR_BAD_FILE,
+                                              "**filenamelong",
+                                              "**filenamelong %s %d",
+                                              fd->filename,
+                                              strlen(fd->filename));
+       else if (errno == ENOENT)
+           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+-                                             MPIR_ERR_RECOVERABLE, myname,
+-                                             __LINE__, MPI_ERR_NO_SUCH_FILE,
++                                             MPIR_ERR_RECOVERABLE,
++                                             myname, __LINE__,
++                                             MPI_ERR_NO_SUCH_FILE,
+                                              "**filenoexist",
+                                              "**filenoexist %s",
+                                              fd->filename);
+@@ -108,27 +162,30 @@
+                                              fd->filename);
+       else if (errno == EACCES) {
+           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+-                                             MPIR_ERR_RECOVERABLE, myname,
+-                                             __LINE__, MPI_ERR_ACCESS,
++                                             MPIR_ERR_RECOVERABLE,
++                                             myname, __LINE__,
++                                             MPI_ERR_ACCESS,
+                                              "**fileaccess",
+-                                             "**fileaccess %s", 
+-                                             fd->filename );
+-      }
+-      else if (errno == EROFS) {
++                                             "**fileaccess %s",
++                                             fd->filename);
++      } else if (errno == EROFS) {
+           /* Read only file or file system and write access requested */
+           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+-                                             MPIR_ERR_RECOVERABLE, myname,
+-                                             __LINE__, MPI_ERR_READ_ONLY,
+-                                             "**ioneedrd", 0 );
+-      }
+-      else {
++                                             MPIR_ERR_RECOVERABLE,
++                                             myname, __LINE__,
++                                             MPI_ERR_READ_ONLY,
++                                             "**ioneedrd", 0);
++      } else {
+           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+-                                             MPIR_ERR_RECOVERABLE, myname,
+-                                             __LINE__, MPI_ERR_IO, "**io",
++                                             MPIR_ERR_RECOVERABLE,
++                                             myname, __LINE__,
++                                             MPI_ERR_IO, "**io",
+                                              "**io %s", strerror(errno));
+       }
+-    }
++    } else {
+     /* --END ERROR HANDLING-- */
+-    else *error_code = MPI_SUCCESS;
++        *error_code = MPI_SUCCESS;
++    }
+ 
++    ADIOI_Free(value);
+ }
+diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
+--- ad_lustre_orig/ad_lustre_rwcontig.c        2008-09-17 14:36:57.000000000 +0800
++++ ad_lustre/ad_lustre_rwcontig.c     2008-10-15 22:44:35.000000000 +0800
+@@ -1,9 +1,11 @@
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+-/* 
+- *   Copyright (C) 1997 University of Chicago. 
++/*
++ *   Copyright (C) 1997 University of Chicago.
+  *   See COPYRIGHT notice in top-level directory.
+  *
+  *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
+  */
+ 
+ #define _XOPEN_SOURCE 600
+diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
+--- ad_lustre_orig/ad_lustre_wrcoll.c  1970-01-01 08:00:00.000000000 +0800
++++ ad_lustre/ad_lustre_wrcoll.c       2008-10-17 16:34:36.000000000 +0800
+@@ -0,0 +1,880 @@
++/* -*- Mode: C; c-basic-offset:4 ; -*- */
++/*
++ *   Copyright (C) 1997 University of Chicago.
++ *   See COPYRIGHT notice in top-level directory.
++ *
++ *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ */
++
++#include "ad_lustre.h"
++#include "adio_extern.h"
++
++/* prototypes of functions used for collective writes only. */
++static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
++                                      MPI_Datatype datatype, int nprocs,
++                                      int myrank,
++                                      ADIOI_Access *others_req,
++                                      ADIOI_Access *my_req,
++                                      ADIO_Offset *offset_list,
++                                      int *len_list,
++                                      int contig_access_count,
++                                      int * striping_info,
++                                      int *buf_idx, int *error_code);
++static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
++                                        ADIOI_Flatlist_node * flat_buf,
++                                        char **send_buf,
++                                        ADIO_Offset * offset_list,
++                                        int *len_list, int *send_size,
++                                        MPI_Request * requests,
++                                        int *sent_to_proc, int nprocs,
++                                        int myrank, int contig_access_count,
++                                        int * striping_info,
++                                        int *send_buf_idx,
++                                          int *curr_to_proc,
++                                        int *done_to_proc, int iter,
++                                        MPI_Aint buftype_extent);
++static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
++                                       char *write_buf,
++                                       ADIOI_Flatlist_node * flat_buf,
++                                       ADIO_Offset * offset_list,
++                                       int *len_list, int *send_size,
++                                       int *recv_size, ADIO_Offset off,
++                                       int size, int *count,
++                                       int *start_pos, int *partial_recv,
++                                       int *sent_to_proc, int nprocs,
++                                       int myrank, int buftype_is_contig,
++                                       int contig_access_count,
++                                       int * striping_info,
++                                       ADIOI_Access * others_req,
++                                       int *send_buf_idx,
++                                       int *curr_to_proc,
++                                       int *done_to_proc, int *hole,
++                                       int iter, MPI_Aint buftype_extent,
++                                       int *buf_idx, int *error_code);
++void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
++                      ADIO_Offset * srt_off, int *srt_len, int *start_pos,
++                      int nprocs, int nprocs_recv, int total_elements);
++
++void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
++                                 MPI_Datatype datatype,
++                                 int file_ptr_type, ADIO_Offset offset,
++                                 ADIO_Status * status, int *error_code)
++{
++    ADIOI_Access *my_req;
++    /* array of nprocs access structures, one for each other process has
++       this process's request */
++
++    ADIOI_Access *others_req;
++    /* array of nprocs access structures, one for each other process
++       whose request is written by this process. */
++
++    int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
++    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
++    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
++    ADIO_Offset orig_fp, start_offset, end_offset, off;
++    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
++    int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
++    int old_error, tmp_error;
++
++    MPI_Comm_size(fd->comm, &nprocs);
++    MPI_Comm_rank(fd->comm, &myrank);
++
++    orig_fp = fd->fp_ind;
++
++    /* IO patten identification if cb_write isn't disabled */
++    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
++      /* For this process's request, calculate the list of offsets and
++         lengths in the file and determine the start and end offsets. */
++      ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
++                            &offset_list, &len_list, &start_offset,
++                            &end_offset, &contig_access_count);
++
++      /* each process communicates its start and end offsets to other
++         processes. The result is an array each of start and end offsets stored
++         in order of process rank. */
++      st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
++      end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
++      MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
++                    ADIO_OFFSET, fd->comm);
++      MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
++                    ADIO_OFFSET, fd->comm);
++      /* are the accesses of different processes interleaved? */
++      for (i = 1; i < nprocs; i++)
++          if ((st_offsets[i] < end_offsets[i-1]) &&
++                (st_offsets[i] <= end_offsets[i]))
++                interleave_count++;
++      /* This is a rudimentary check for interleaving, but should suffice
++         for the moment. */
++
++      /* Two typical access patterns can benefit from collective write.
++         *   1) the processes are interleaved, and
++         *   2) the req size is small.
++         */
++        if (interleave_count > 0) {
++          do_collect = 1;
++        } else {
++            do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
++                                              len_list, nprocs);
++        }
++    }
++    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
++
++    /* Decide if collective I/O should be done */
++    if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
++        fd->hints->cb_write == ADIOI_HINT_DISABLE) {
++
++      int filerange_is_contig = 0;
++
++      /* use independent accesses */
++      if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
++          ADIOI_Free(offset_list);
++          ADIOI_Free(len_list);
++            ADIOI_Free(st_offsets);
++            ADIOI_Free(end_offsets);
++      }
++
++      fd->fp_ind = orig_fp;
++      ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
++      if (buftype_is_contig && filetype_is_contig) {
++          if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
++              off = fd->disp + (fd->etype_size) * offset;
++              ADIO_WriteContig(fd, buf, count, datatype,
++                               ADIO_EXPLICIT_OFFSET,
++                               off, status, error_code);
++          } else
++              ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
++                               0, status, error_code);
++      } else {
++          ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
++                            offset, status, error_code);
++      }
++      return;
++    }
++
++    /* Get Lustre hints information */
++    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);
++    /* calculate what portions of the access requests of this process are
++     * located in which process
++     */
++    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
++                             striping_info, nprocs, &count_my_req_procs,
++                             &count_my_req_per_proc, &my_req, &buf_idx);
++    /* calculate what process's requests will be written by this process */
++    ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
++                                 count_my_req_per_proc,
++                               my_req, nprocs, myrank,
++                                 start_offset, end_offset, striping_info,
++                                 &count_others_req_procs, &others_req);
++    ADIOI_Free(count_my_req_per_proc);
++
++    /* exchange data and write in sizes of no more than stripe_size. */
++    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
++                                others_req, my_req,
++                                offset_list, len_list, contig_access_count,
++                              striping_info, buf_idx, error_code);
++
++    old_error = *error_code;
++    if (*error_code != MPI_SUCCESS)
++      *error_code = MPI_ERR_IO;
++
++    /* optimization: if only one process performing i/o, we can perform
++     * a less-expensive Bcast  */
++#ifdef ADIOI_MPE_LOGGING
++    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
++#endif
++    if (fd->hints->cb_nodes == 1)
++      MPI_Bcast(error_code, 1, MPI_INT,
++                fd->hints->ranklist[0], fd->comm);
++    else {
++      tmp_error = *error_code;
++      MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
++                    MPI_MAX, fd->comm);
++    }
++#ifdef ADIOI_MPE_LOGGING
++    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
++#endif
++
++    if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
++      *error_code = old_error;
++
++
++    if (!buftype_is_contig)
++      ADIOI_Delete_flattened(datatype);
++
++    /* free all memory allocated for collective I/O */
++    /* free others_req */
++    for (i = 0; i < nprocs; i++) {
++      if (others_req[i].count) {
++          ADIOI_Free(others_req[i].offsets);
++          ADIOI_Free(others_req[i].lens);
++          ADIOI_Free(others_req[i].mem_ptrs);
++      }
++    }
++    ADIOI_Free(others_req);
++    /* free my_req here */
++    for (i = 0; i < nprocs; i++) {
++      if (my_req[i].count) {
++          ADIOI_Free(my_req[i].offsets);
++          ADIOI_Free(my_req[i].lens);
++      }
++    }
++    ADIOI_Free(my_req);
++    ADIOI_Free(buf_idx);
++    ADIOI_Free(offset_list);
++    ADIOI_Free(len_list);
++    ADIOI_Free(st_offsets);
++    ADIOI_Free(end_offsets);
++    ADIOI_Free(striping_info);
++
++#ifdef HAVE_STATUS_SET_BYTES
++    if (status) {
++      int bufsize, size;
++      /* Don't set status if it isn't needed */
++      MPI_Type_size(datatype, &size);
++      bufsize = size * count;
++      MPIR_Status_set_bytes(status, datatype, bufsize);
++    }
++    /* This is a temporary way of filling in status. The right way is to
++     * keep track of how much data was actually written during collective I/O.
++     */
++#endif
++
++    fd->fp_sys_posn = -1;     /* set it to null. */
++}
++
++static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
++                                      MPI_Datatype datatype, int nprocs,
++                                      int myrank, ADIOI_Access *others_req,
++                                        ADIOI_Access *my_req,
++                                      ADIO_Offset *offset_list,
++                                        int *len_list, int contig_access_count,
++                                      int *striping_info, int *buf_idx,
++                                        int *error_code)
++{
++    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
++    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
++    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
++    ADIO_Offset max_size, step_size = 0;
++    int real_size, req_len, send_len;
++    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
++    int *send_curr_offlen_ptr, *send_size;
++    int *partial_recv, *sent_to_proc, *recv_start_pos;
++    int *send_buf_idx, *curr_to_proc, *done_to_proc;
++    char *write_buf = NULL, *value;
++    MPI_Status status;
++    ADIOI_Flatlist_node *flat_buf = NULL;
++    MPI_Aint buftype_extent;
++    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
++    int lflag, data_sieving = 0;
++
++    *error_code = MPI_SUCCESS;        /* changed below if error */
++
++    /* calculate the number of writes of stripe size to be done.
++     * That gives the no. of communication phases as well.
++     * Note:
++     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
++     *   each process has the same no. of communication phases.
++     */
++
++    for (i = 0; i < nprocs; i++) {
++      if (others_req[i].count) {
++          st_loc = others_req[i].offsets[0];
++          end_loc = others_req[i].offsets[0];
++          break;
++      }
++    }
++    for (i = 0; i < nprocs; i++) {
++      for (j = 0; j < others_req[i].count; j++) {
++          st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
++          end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
++                                          others_req[i].lens[j] - 1));
++      }
++    }
++    /* this process does no writing. */
++    if ((st_loc == -1) && (end_loc == -1))
++      ntimes = 0;
++    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
++    /* avoid min_st_loc be -1 */
++    if (st_loc == -1)
++        st_loc = max_end_loc;
++    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
++    /* align downward */
++    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
++
++    /* Each time, only avail_cb_nodes number of IO clients perform IO,
++     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
++     * and ntimes=whole_file_portion/step_size
++     */
++    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
++    max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
++    if (ntimes)
++      write_buf = (char *) ADIOI_Malloc(stripe_size);
++
++    /* calculate the start offset for each iteration */
++    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
++    for (m = 0; m < max_ntimes; m ++)
++        off_list[m] = max_end_loc;
++    for (i = 0; i < nprocs; i++) {
++        for (j = 0; j < others_req[i].count; j ++) {
++            req_off = others_req[i].offsets[j];
++            m = (int)((req_off - min_st_loc) / step_size);
++            off_list[m] = ADIOI_MIN(off_list[m], req_off);
++        }
++    }
++
++    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
++    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
++    /* their use is explained below. calloc initializes to 0. */
++
++    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    /* to store count of how many off-len pairs per proc are satisfied
++       in an iteration. */
++
++    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    /* total size of data to be sent to each proc. in an iteration.
++       Of size nprocs so that I can use MPI_Alltoall later. */
++
++    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    /* total size of data to be recd. from each proc. in an iteration. */
++
++    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
++    /* amount of data sent to each proc so far. Used in
++       ADIOI_Fill_send_buffer. initialized to 0 here. */
++
++    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    /* Above three are used in ADIOI_Fill_send_buffer */
++
++    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++    /* used to store the starting value of recv_curr_offlen_ptr[i] in
++       this iteration */
++
++    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
++    if (!buftype_is_contig) {
++      ADIOI_Flatten_datatype(datatype);
++      flat_buf = ADIOI_Flatlist;
++      while (flat_buf->type != datatype)
++          flat_buf = flat_buf->next;
++    }
++    MPI_Type_extent(datatype, &buftype_extent);
++
++    iter_st_off = min_st_loc;
++
++    /* Although we have recognized the data according to OST index,
++     * a read-modify-write will be done if there is a hole between the data.
++     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
++     * then rank0 will collect data [0, 30] and [60, 90] then write. There
++     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
++     *
++     * To reduce its impact on the performance, we disable data sieving
++     * by default, unless the hint "ds_in_coll" is enabled.
++     */
++    /* check the hint for data sieving */
++    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
++    MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
++    if (lflag && !strcmp(value, "enable"))
++        data_sieving = 1;
++    ADIOI_Free(value);
++
++    for (m = 0; m < max_ntimes; m++) {
++      /* go through all others_req and my_req to check which will be received
++         * and sent in this iteration.
++         */
++
++      /* Note that MPI guarantees that displacements in filetypes are in
++         monotonically nondecreasing order and that, for writes, the
++         filetypes cannot specify overlapping regions in the file. This
++         simplifies implementation a bit compared to reads. */
++
++      /*
++           off         = start offset in the file for the data to be written in
++                         this iteration
++           iter_st_off = start offset of this iteration
++           real_size   = size of data written (bytes) corresponding to off
++           max_size    = possible maximum size of data written in this iteration
++           req_off     = offset in the file for a particular contiguous request minus
++                         what was satisfied in previous iteration
++           send_off    = offset the request needed by other processes in this iteration
++           req_len     = size corresponding to req_off
++           send_len    = size corresponding to send_off
++         */
++
++      /* first calculate what should be communicated */
++      for (i = 0; i < nprocs; i++)
++          recv_count[i] = recv_size[i] = send_size[i] = 0;
++
++        off = off_list[m];
++        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
++        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off,
++                                    end_loc - off + 1);
++
++      for (i = 0; i < nprocs; i++) {
++            if (my_req[i].count) {
++                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
++                    send_off = my_req[i].offsets[j];
++                    send_len = my_req[i].lens[j];
++                    if (send_off < iter_st_off + max_size) {
++                        send_size[i] += send_len;
++                    } else {
++                        break;
++                    }
++                }
++                send_curr_offlen_ptr[i] = j;
++            }
++          if (others_req[i].count) {
++              recv_start_pos[i] = recv_curr_offlen_ptr[i];
++              for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
++                    req_off = others_req[i].offsets[j];
++                    req_len = others_req[i].lens[j];
++                  if (req_off < iter_st_off + max_size) {
++                      recv_count[i]++;
++                      MPI_Address(write_buf + req_off - off,
++                                  &(others_req[i].mem_ptrs[j]));
++                        recv_size[i] += req_len;
++                  } else {
++                      break;
++                    }
++              }
++              recv_curr_offlen_ptr[i] = j;
++          }
++      }
++        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
++        hole = data_sieving;
++      ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
++                                     len_list, send_size, recv_size, off, real_size,
++                                     recv_count, recv_start_pos, partial_recv,
++                                     sent_to_proc, nprocs, myrank,
++                                     buftype_is_contig, contig_access_count,
++                                     striping_info, others_req, send_buf_idx,
++                                     curr_to_proc, done_to_proc, &hole, m,
++                                     buftype_extent, buf_idx, error_code);
++      if (*error_code != MPI_SUCCESS)
++            goto over;
++
++      flag = 0;
++      for (i = 0; i < nprocs; i++)
++          if (recv_count[i]) {
++              flag = 1;
++              break;
++          }
++      if (flag) {
++            /* check whether to do data sieving */
++            if(data_sieving) {
++              ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
++                               ADIO_EXPLICIT_OFFSET, off, &status,
++                               error_code);
++            } else {
++                /* if there is no hole, write data in one time;
++                 * otherwise, write data in several times */
++                if (!hole) {
++                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
++                                     ADIO_EXPLICIT_OFFSET, off, &status,
++                                     error_code);
++                } else {
++                    for (i = 0; i < nprocs; i++) {
++                        if (others_req[i].count) {
++                            for (j = 0; j < others_req[i].count; j++) {
++                                if (others_req[i].offsets[j] < off + real_size &&
++                                    others_req[i].offsets[j] >= off) {
++                                    ADIO_WriteContig(fd,
++                                                     write_buf + others_req[i].offsets[j] - off,
++                                                     others_req[i].lens[j],
++                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
++                                                     others_req[i].offsets[j], &status,
++                                                     error_code);
++                                  if (*error_code != MPI_SUCCESS)
++                                      goto over;
++                                }
++                            }
++                        }
++                    }
++                }
++            }
++          if (*error_code != MPI_SUCCESS)
++              goto over;
++      }
++        iter_st_off += max_size;
++    }
++over:
++    if (ntimes)
++      ADIOI_Free(write_buf);
++    ADIOI_Free(recv_curr_offlen_ptr);
++    ADIOI_Free(send_curr_offlen_ptr);
++    ADIOI_Free(recv_count);
++    ADIOI_Free(send_size);
++    ADIOI_Free(recv_size);
++    ADIOI_Free(sent_to_proc);
++    ADIOI_Free(recv_start_pos);
++    ADIOI_Free(send_buf_idx);
++    ADIOI_Free(curr_to_proc);
++    ADIOI_Free(done_to_proc);
++    ADIOI_Free(off_list);
++}
++
++static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
++                                       char *write_buf,
++                                       ADIOI_Flatlist_node * flat_buf,
++                                       ADIO_Offset * offset_list,
++                                       int *len_list, int *send_size,
++                                       int *recv_size, ADIO_Offset off,
++                                       int size, int *count,
++                                       int *start_pos, int *partial_recv,
++                                       int *sent_to_proc, int nprocs,
++                                       int myrank, int buftype_is_contig,
++                                       int contig_access_count,
++                                       int * striping_info,
++                                       ADIOI_Access * others_req,
++                                       int *send_buf_idx,
++                                       int *curr_to_proc, int *done_to_proc,
++                                         int *hole, int iter,
++                                         MPI_Aint buftype_extent,
++                                       int *buf_idx, int *error_code)
++{
++    int i, j, nprocs_recv, nprocs_send, err;
++    char **send_buf = NULL;
++    MPI_Request *requests, *send_req;
++    MPI_Datatype *recv_types;
++    MPI_Status *statuses, status;
++    int *srt_len, sum, sum_recv;
++    ADIO_Offset *srt_off;
++    int data_sieving = *hole;
++    static char myname[] = "ADIOI_W_EXCHANGE_DATA";
++
++    /* create derived datatypes for recv */
++    nprocs_recv = 0;
++    for (i = 0; i < nprocs; i++)
++      if (recv_size[i])
++          nprocs_recv++;
++
++    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
++                                             sizeof(MPI_Datatype));
++    /* +1 to avoid a 0-size malloc */
++
++    j = 0;
++    for (i = 0; i < nprocs; i++) {
++      if (recv_size[i]) {
++          MPI_Type_hindexed(count[i],
++                            &(others_req[i].lens[start_pos[i]]),
++                            &(others_req[i].mem_ptrs[start_pos[i]]),
++                            MPI_BYTE, recv_types + j);
++          /* absolute displacements; use MPI_BOTTOM in recv */
++          MPI_Type_commit(recv_types + j);
++          j++;
++      }
++    }
++
++    /* To avoid a read-modify-write,
++     * check if there are holes in the data to be written.
++     * For this, merge the (sorted) offset lists others_req using a heap-merge.
++     */
++
++    sum = 0;
++    for (i = 0; i < nprocs; i++)
++      sum += count[i];
++    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
++    srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
++    /* +1 to avoid a 0-size malloc */
++
++    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
++                   nprocs, nprocs_recv, sum);
++
++    /* check if there are any holes */
++    *hole = 0;
++    for (i = 0; i < sum - 1; i++) {
++        if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
++            *hole = 1;
++          break;
++      }
++    }
++    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
++     * between aggregation, nominally contiguous regions, and cb_buffer_size
++     * should be handled with a read-modify-write (otherwise we will write out
++     * more data than we receive from everyone else (inclusive), so override
++     * hole detection
++     */
++    if (*hole == 0) {
++        sum_recv = 0;
++        for (i = 0; i < nprocs; i++)
++            sum_recv += recv_size[i];
++      if (size > sum_recv)
++          *hole = 1;
++    }
++    /* check the hint for data sieving */
++    if (data_sieving && nprocs_recv && *hole) {
++        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
++                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
++        // --BEGIN ERROR HANDLING--
++        if (err != MPI_SUCCESS) {
++            *error_code = MPIO_Err_create_code(err,
++                                               MPIR_ERR_RECOVERABLE,
++                                               myname, __LINE__,
++                                               MPI_ERR_IO,
++                                               "**ioRMWrdwr", 0);
++            ADIOI_Free(recv_types);
++            ADIOI_Free(srt_off);
++            ADIOI_Free(srt_len);
++            return;
++        }
++        // --END ERROR HANDLING--
++    }
++    ADIOI_Free(srt_off);
++    ADIOI_Free(srt_len);
++
++    nprocs_send = 0;
++    for (i = 0; i < nprocs; i++)
++      if (send_size[i])
++          nprocs_send++;
++
++    if (fd->atomicity) {
++      /* bug fix from Wei-keng Liao and Kenin Coloma */
++      requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
++                                                sizeof(MPI_Request));
++      send_req = requests;
++    } else {
++      requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
++                                                sizeof(MPI_Request));
++      /* +1 to avoid a 0-size malloc */
++
++      /* post receives */
++      j = 0;
++      for (i = 0; i < nprocs; i++) {
++          if (recv_size[i]) {
++              MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
++                        myrank + i + 100 * iter, fd->comm, requests + j);
++              j++;
++          }
++      }
++      send_req = requests + nprocs_recv;
++    }
++
++    /* post sends.
++     * if buftype_is_contig, data can be directly sent from
++     * user buf at location given by buf_idx. else use send_buf.
++     */
++    if (buftype_is_contig) {
++      j = 0;
++      for (i = 0; i < nprocs; i++)
++          if (send_size[i]) {
++              MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
++                        MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
++                        send_req + j);
++              j++;
++              buf_idx[i] += send_size[i];
++          }
++    } else if (nprocs_send) {
++      /* buftype is not contig */
++      send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
++      for (i = 0; i < nprocs; i++)
++          if (send_size[i])
++              send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
++
++      ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
++                                      len_list, send_size, send_req,
++                                      sent_to_proc, nprocs, myrank,
++                                      contig_access_count, striping_info,
++                                      send_buf_idx, curr_to_proc, done_to_proc,
++                                      iter, buftype_extent);
++      /* the send is done in ADIOI_Fill_send_buffer */
++    }
++
++      /* bug fix from Wei-keng Liao and Kenin Coloma */
++    if (fd->atomicity) {
++      j = 0;
++      for (i = 0; i < nprocs; i++) {
++          MPI_Status wkl_status;
++          if (recv_size[i]) {
++              MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
++                       myrank + i + 100 * iter, fd->comm, &wkl_status);
++              j++;
++          }
++      }
++    }
++
++    for (i = 0; i < nprocs_recv; i++)
++      MPI_Type_free(recv_types + i);
++    ADIOI_Free(recv_types);
++
++      /* bug fix from Wei-keng Liao and Kenin Coloma */
++      /* +1 to avoid a 0-size malloc */
++    if (fd->atomicity) {
++      statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
++                                             sizeof(MPI_Status));
++    } else {
++      statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
++                                             sizeof(MPI_Status));
++    }
++
++#ifdef NEEDS_MPI_TEST
++    i = 0;
++    if (fd->atomicity) {
++      /* bug fix from Wei-keng Liao and Kenin Coloma */
++      while (!i)
++          MPI_Testall(nprocs_send, send_req, &i, statuses);
++    } else {
++      while (!i)
++          MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
++    }
++#else
++      /* bug fix from Wei-keng Liao and Kenin Coloma */
++    if (fd->atomicity)
++      MPI_Waitall(nprocs_send, send_req, statuses);
++    else
++      MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
++#endif
++    ADIOI_Free(statuses);
++    ADIOI_Free(requests);
++    if (!buftype_is_contig && nprocs_send) {
++      for (i = 0; i < nprocs; i++)
++          if (send_size[i])
++              ADIOI_Free(send_buf[i]);
++      ADIOI_Free(send_buf);
++    }
++}
++
++#define ADIOI_BUF_INCR \
++{ \
++    while (buf_incr) { \
++        size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
++        user_buf_idx += size_in_buf; \
++        flat_buf_sz -= size_in_buf; \
++        if (!flat_buf_sz) { \
++            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
++            else { \
++                flat_buf_idx = 0; \
++                n_buftypes++; \
++            } \
++            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
++                              n_buftypes*buftype_extent; \
++            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
++        } \
++        buf_incr -= size_in_buf; \
++    } \
++}
++
++
++#define ADIOI_BUF_COPY \
++{ \
++    while (size) { \
++        size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
++        memcpy(&(send_buf[p][send_buf_idx[p]]), \
++               ((char *) buf) + user_buf_idx, size_in_buf); \
++        send_buf_idx[p] += size_in_buf; \
++        user_buf_idx += size_in_buf; \
++        flat_buf_sz -= size_in_buf; \
++        if (!flat_buf_sz) { \
++            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
++            else { \
++                flat_buf_idx = 0; \
++                n_buftypes++; \
++            } \
++            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
++                              n_buftypes*buftype_extent; \
++            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
++        } \
++        size -= size_in_buf; \
++        buf_incr -= size_in_buf; \
++    } \
++    ADIOI_BUF_INCR \
++}
++
++static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
++                                        ADIOI_Flatlist_node * flat_buf,
++                                        char **send_buf,
++                                        ADIO_Offset * offset_list,
++                                        int *len_list, int *send_size,
++                                        MPI_Request * requests,
++                                        int *sent_to_proc, int nprocs,
++                                        int myrank,
++                                        int contig_access_count,
++                                        int * striping_info,
++                                        int *send_buf_idx,
++                                        int *curr_to_proc,
++                                        int *done_to_proc, int iter,
++                                        MPI_Aint buftype_extent)
++{
++    /* this function is only called if buftype is not contig */
++    int i, p, flat_buf_idx, size;
++    int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
++    ADIO_Offset off, len, rem_len, user_buf_idx;
++
++    /* curr_to_proc[p] = amount of data sent to proc. p that has already
++     * been accounted for so far
++     * done_to_proc[p] = amount of data already sent to proc. p in
++     * previous iterations
++     * user_buf_idx = current location in user buffer
++     * send_buf_idx[p] = current location in send_buf of proc. p
++     */
++
++    for (i = 0; i < nprocs; i++) {
++      send_buf_idx[i] = curr_to_proc[i] = 0;
++      done_to_proc[i] = sent_to_proc[i];
++    }
++    jj = 0;
++
++    user_buf_idx = flat_buf->indices[0];
++    flat_buf_idx = 0;
++    n_buftypes = 0;
++    flat_buf_sz = flat_buf->blocklens[0];
++
++    /* flat_buf_idx = current index into flattened buftype
++     * flat_buf_sz = size of current contiguous component in flattened buf
++     */
++    for (i = 0; i < contig_access_count; i++) {
++      off = offset_list[i];
++      rem_len = (ADIO_Offset) len_list[i];
++
++      /*this request may span to more than one process */
++      while (rem_len != 0) {
++          len = rem_len;
++          /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
++           * longer than the single region that processor "p" is responsible
++           * for.
++           */
++          p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
++
++          if (send_buf_idx[p] < send_size[p]) {
++              if (curr_to_proc[p] + len > done_to_proc[p]) {
++                  if (done_to_proc[p] > curr_to_proc[p]) {
++                      size = (int) ADIOI_MIN(curr_to_proc[p] + len -
++                                             done_to_proc[p],
++                                             send_size[p] -
++                                             send_buf_idx[p]);
++                      buf_incr = done_to_proc[p] - curr_to_proc[p];
++                      ADIOI_BUF_INCR
++                          buf_incr = (int) (curr_to_proc[p] + len -
++                                            done_to_proc[p]);
++                      curr_to_proc[p] = done_to_proc[p] + size;
++                      ADIOI_BUF_COPY
++                    } else {
++                      size = (int) ADIOI_MIN(len, send_size[p] -
++                                             send_buf_idx[p]);
++                      buf_incr = (int) len;
++                      curr_to_proc[p] += size;
++                      ADIOI_BUF_COPY
++                    }
++                  if (send_buf_idx[p] == send_size[p]) {
++                      MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
++                                myrank + p + 100 * iter, fd->comm,
++                                requests + jj);
++                      jj++;
++                  }
++              } else {
++                  curr_to_proc[p] += (int) len;
++                  buf_incr = (int) len;
++                  ADIOI_BUF_INCR
++                }
++          } else {
++              buf_incr = (int) len;
++              ADIOI_BUF_INCR
++            }
++          off += len;
++          rem_len -= len;
++      }
++    }
++    for (i = 0; i < nprocs; i++)
++      if (send_size[i])
++          sent_to_proc[i] = curr_to_proc[i];
++}
+diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
+--- ad_lustre_orig/ad_lustre_wrstr.c   1970-01-01 08:00:00.000000000 +0800
++++ ad_lustre/ad_lustre_wrstr.c        2008-10-13 15:34:53.000000000 +0800
+@@ -0,0 +1,472 @@
++/* -*- Mode: C; c-basic-offset:4 ; -*- */
++/*
++ *   Copyright (C) 1997 University of Chicago.
++ *   See COPYRIGHT notice in top-level directory.
++ *
++ *   Copyright (C) 2007 Oak Ridge National Laboratory
++ *
++ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ */
++
++#include "ad_lustre.h"
++#include "adio_extern.h"
++
++#define ADIOI_BUFFERED_WRITE \
++{ \
++    if (req_off >= writebuf_off + writebuf_len) { \
++        if (writebuf_len) { \
++           ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
++                  ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
++           if (!(fd->atomicity)) \
++                ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
++           if (*error_code != MPI_SUCCESS) { \
++               *error_code = MPIO_Err_create_code(*error_code, \
++                                                  MPIR_ERR_RECOVERABLE, myname, \
++                                                  __LINE__, MPI_ERR_IO, \
++                                                  "**iowswc", 0); \
++               ADIOI_Free(writebuf); \
++               return; \
++           } \
++        } \
++      writebuf_off = req_off; \
++        /* stripe_size alignment */ \
++        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
++                                       (writebuf_off / stripe_size + 1) * \
++                                       stripe_size - writebuf_off);\
++      if (!(fd->atomicity)) \
++            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
++      ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
++                        writebuf_off, &status1, error_code); \
++      if (*error_code != MPI_SUCCESS) { \
++          *error_code = MPIO_Err_create_code(*error_code, \
++                                             MPIR_ERR_RECOVERABLE, myname, \
++                                             __LINE__, MPI_ERR_IO, \
++                                             "**iowsrc", 0); \
++            ADIOI_Free(writebuf); \
++          return; \
++      } \
++    } \
++    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
++    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
++    while (write_sz != req_len) {\
++        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
++                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
++        if (!(fd->atomicity)) \
++            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
++        if (*error_code != MPI_SUCCESS) { \
++            *error_code = MPIO_Err_create_code(*error_code, \
++                                               MPIR_ERR_RECOVERABLE, myname, \
++                                               __LINE__, MPI_ERR_IO, \
++                                               "**iowswc", 0); \
++            ADIOI_Free(writebuf); \
++            return; \
++        } \
++        req_len -= write_sz; \
++        userbuf_off += write_sz; \
++        writebuf_off += writebuf_len; \
++        /* stripe_size alignment */ \
++        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
++                                       (writebuf_off / stripe_size + 1) * \
++                                       stripe_size - writebuf_off);\
++      if (!(fd->atomicity)) \
++            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
++        ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
++                        writebuf_off, &status1, error_code); \
++      if (*error_code != MPI_SUCCESS) { \
++          *error_code = MPIO_Err_create_code(*error_code, \
++                                             MPIR_ERR_RECOVERABLE, myname, \
++                                             __LINE__, MPI_ERR_IO, \
++                                             "**iowsrc", 0); \
++            ADIOI_Free(writebuf); \
++          return; \
++      } \
++        write_sz = ADIOI_MIN(req_len, writebuf_len); \
++        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
++    } \
++}
++
++
++/* this macro is used when filetype is contig and buftype is not contig.
++   it does not do a read-modify-write and does not lock*/
++#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
++{ \
++    if (req_off >= writebuf_off + writebuf_len) { \
++      writebuf_off = req_off; \
++        /* stripe_size alignment */ \
++        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
++                                       (writebuf_off / stripe_size + 1) * \
++                                       stripe_size - writebuf_off);\
++    } \
++    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
++    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
++    while (req_len) { \
++        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
++                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
++        if (*error_code != MPI_SUCCESS) { \
++            *error_code = MPIO_Err_create_code(*error_code, \
++                                               MPIR_ERR_RECOVERABLE, myname, \
++                                               __LINE__, MPI_ERR_IO, \
++                                               "**iowswc", 0); \
++            ADIOI_Free(writebuf); \
++            return; \
++        } \
++        req_len -= write_sz; \
++        userbuf_off += write_sz; \
++        writebuf_off += writebuf_len; \
++        /* stripe_size alignment */ \
++        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
++                                       (writebuf_off / stripe_size + 1) * \
++                                       stripe_size - writebuf_off);\
++        write_sz = ADIOI_MIN(req_len, writebuf_len); \
++        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
++    } \
++}
++
++void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
++                             MPI_Datatype datatype, int file_ptr_type,
++                             ADIO_Offset offset, ADIO_Status * status,
++                             int *error_code)
++{
++    /* offset is in units of etype relative to the filetype. */
++    ADIOI_Flatlist_node *flat_buf, *flat_file;
++    int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
++    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
++    int n_filetypes, etype_in_filetype;
++    ADIO_Offset abs_off_in_filetype = 0;
++    int filetype_size, etype_size, buftype_size, req_len;
++    MPI_Aint filetype_extent, buftype_extent;
++    int buf_count, buftype_is_contig, filetype_is_contig;
++    ADIO_Offset userbuf_off;
++    ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
++    char *writebuf;
++    int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
++    ADIO_Status status1;
++    int new_bwr_size, new_fwr_size;
++    char * value;
++    int stripe_size, lflag = 0;
++    static char myname[] = "ADIOI_LUSTRE_WriteStrided";
++    int myrank;
++    MPI_Comm_rank(fd->comm, &myrank);
++
++    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
++      /* if user has disabled data sieving on writes, use naive
++       * approach instead.
++       */
++      ADIOI_GEN_WriteStrided_naive(fd,
++                                   buf,
++                                   count,
++                                   datatype,
++                                   file_ptr_type,
++                                   offset, status, error_code);
++      return;
++    }
++
++    *error_code = MPI_SUCCESS;        /* changed below if error */
++
++    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
++    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
++
++    MPI_Type_size(fd->filetype, &filetype_size);
++    if (!filetype_size) {
++      *error_code = MPI_SUCCESS;
++      return;
++    }
++
++    MPI_Type_extent(fd->filetype, &filetype_extent);
++    MPI_Type_size(datatype, &buftype_size);
++    MPI_Type_extent(datatype, &buftype_extent);
++    etype_size = fd->etype_size;
++
++    bufsize = buftype_size * count;
++
++    /* get striping info */
++    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
++    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
++    if (lflag)
++      stripe_size = atoi(value);
++    ADIOI_Free(value);
++
++    /* Different buftype to different filetype */
++    if (!buftype_is_contig && filetype_is_contig) {
++        /* noncontiguous in memory, contiguous in file. */
++      ADIOI_Flatten_datatype(datatype);
++      flat_buf = ADIOI_Flatlist;
++      while (flat_buf->type != datatype)
++          flat_buf = flat_buf->next;
++
++      off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
++          fd->disp + etype_size * offset;
++
++      start_off = off;
++      end_offset = start_off + bufsize - 1;
++      writebuf_off = start_off;
++        /* write stripe size buffer each time */
++      writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
++        writebuf_len = (int) ADIOI_MIN(bufsize,
++                                       (writebuf_off / stripe_size + 1) *
++                                       stripe_size - writebuf_off);
++
++        /* if atomicity is true, lock the region to be accessed */
++      if (fd->atomicity)
++          ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
++
++      for (j = 0; j < count; j++) {
++          for (i = 0; i < flat_buf->count; i++) {
++              userbuf_off = j * buftype_extent + flat_buf->indices[i];
++              req_off = off;
++              req_len = flat_buf->blocklens[i];
++              ADIOI_BUFFERED_WRITE_WITHOUT_READ
++              off += flat_buf->blocklens[i];
++          }
++        }
++
++      /* write the buffer out finally */
++      ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
++                       ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
++                       error_code);
++
++      if (fd->atomicity)
++          ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
++      if (*error_code != MPI_SUCCESS) {
++            ADIOI_Free(writebuf);
++          return;
++        }
++      ADIOI_Free(writebuf);
++      if (file_ptr_type == ADIO_INDIVIDUAL)
++          fd->fp_ind = off;
++    } else {
++        /* noncontiguous in file */
++        /* filetype already flattened in ADIO_Open */
++      flat_file = ADIOI_Flatlist;
++      while (flat_file->type != fd->filetype)
++          flat_file = flat_file->next;
++      disp = fd->disp;
++
++      if (file_ptr_type == ADIO_INDIVIDUAL) {
++          offset = fd->fp_ind;        /* in bytes */
++          n_filetypes = -1;
++          flag = 0;
++          while (!flag) {
++              n_filetypes++;
++              for (i = 0; i < flat_file->count; i++) {
++                  if (disp + flat_file->indices[i] +
++                      (ADIO_Offset) n_filetypes * filetype_extent +
++                      flat_file->blocklens[i] >= offset) {
++                      st_index = i;
++                      fwr_size = (int) (disp + flat_file->indices[i] +
++                                        (ADIO_Offset) n_filetypes *
++                                        filetype_extent +
++                                        flat_file->blocklens[i] -
++                                        offset);
++                      flag = 1;
++                      break;
++                  }
++              }
++          }
++      } else {
++          n_etypes_in_filetype = filetype_size / etype_size;
++          n_filetypes = (int) (offset / n_etypes_in_filetype);
++          etype_in_filetype = (int) (offset % n_etypes_in_filetype);
++          size_in_filetype = etype_in_filetype * etype_size;
++
++          sum = 0;
++          for (i = 0; i < flat_file->count; i++) {
++              sum += flat_file->blocklens[i];
++              if (sum > size_in_filetype) {
++                  st_index = i;
++                  fwr_size = sum - size_in_filetype;
++                  abs_off_in_filetype = flat_file->indices[i] +
++                      size_in_filetype - (sum - flat_file->blocklens[i]);
++                  break;
++              }
++          }
++
++          /* abs. offset in bytes in the file */
++          offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
++                   abs_off_in_filetype;
++      }
++
++      start_off = offset;
++
++      /* If the file bytes is actually contiguous, we do not need data sieve at all */
++      if (bufsize <= fwr_size) {
++            req_off = start_off;
++            req_len = bufsize;
++            end_offset = start_off + bufsize - 1;
++          writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
++          memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
++            writebuf_off = 0;
++            writebuf_len = 0;
++            userbuf_off = 0;
++            ADIOI_BUFFERED_WRITE_WITHOUT_READ
++      } else {
++          /* Calculate end_offset, the last byte-offset that will be accessed.
++             e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */
++          st_fwr_size = fwr_size;
++          st_n_filetypes = n_filetypes;
++          i = 0;
++          j = st_index;
++          off = offset;
++          fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
++          while (i < bufsize) {
++              i += fwr_size;
++              end_offset = off + fwr_size - 1;
++
++              if (j < (flat_file->count - 1))
++                  j++;
++              else {
++                  j = 0;
++                  n_filetypes++;
++              }
++
++              off = disp + flat_file->indices[j] +
++                    (ADIO_Offset) n_filetypes * filetype_extent;
++              fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
++          }
++
++          writebuf_off = 0;
++          writebuf_len = 0;
++          writebuf = (char *) ADIOI_Malloc(stripe_size);
++          memset(writebuf, -1, stripe_size);
++          /* if atomicity is true, lock the region to be accessed */
++          if (fd->atomicity)
++              ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
++
++          if (buftype_is_contig && !filetype_is_contig) {
++              /* contiguous in memory, noncontiguous in file. should be the most
++                 common case. */
++              i = 0;
++              j = st_index;
++              off = offset;
++              n_filetypes = st_n_filetypes;
++              fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
++              while (i < bufsize) {
++                  if (fwr_size) {
++                      /* TYPE_UB and TYPE_LB can result in
++                         fwr_size = 0. save system call in such cases */
++                      /*
++                        lseek(fd->fd_sys, off, SEEK_SET);
++                      err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);
++                        */
++                      req_off = off;
++                      req_len = fwr_size;
++                      userbuf_off = i;
++                      ADIOI_BUFFERED_WRITE
++                    }
++                  i += fwr_size;
++
++                  if (off + fwr_size < disp + flat_file->indices[j] +
++                                       flat_file->blocklens[j] +
++                          (ADIO_Offset) n_filetypes * filetype_extent)
++                      off += fwr_size;
++                  /* did not reach end of contiguous block in filetype.
++                  no more I/O needed. off is incremented by fwr_size. */
++                  else {
++                      if (j < (flat_file->count - 1))
++                          j++;
++                      else {
++                          j = 0;
++                          n_filetypes++;
++                      }
++                      off = disp + flat_file->indices[j] +
++                            (ADIO_Offset) n_filetypes * filetype_extent;
++                      fwr_size = ADIOI_MIN(flat_file->blocklens[j],
++                                             bufsize - i);
++                  }
++              }
++          } else {
++                  /* noncontiguous in memory as well as in file */
++              ADIOI_Flatten_datatype(datatype);
++              flat_buf = ADIOI_Flatlist;
++              while (flat_buf->type != datatype)
++                  flat_buf = flat_buf->next;
++
++              k = num = buf_count = 0;
++              i = (int) (flat_buf->indices[0]);
++              j = st_index;
++              off = offset;
++              n_filetypes = st_n_filetypes;
++              fwr_size = st_fwr_size;
++              bwr_size = flat_buf->blocklens[0];
++
++              while (num < bufsize) {
++                  size = ADIOI_MIN(fwr_size, bwr_size);
++                  if (size) {
++                      /*
++                        lseek(fd->fd_sys, off, SEEK_SET);
++                       err = write(fd->fd_sys, ((char *) buf) + i, size);
++                        */
++                      req_off = off;
++                      req_len = size;
++                      userbuf_off = i;
++                      ADIOI_BUFFERED_WRITE
++                    }
++
++                  new_fwr_size = fwr_size;
++                  new_bwr_size = bwr_size;
++
++                  if (size == fwr_size) {
++                      /* reached end of contiguous block in file */
++                      if (j < (flat_file->count - 1)) {
++                          j++;
++                        } else {
++                          j = 0;
++                          n_filetypes++;
++                      }
++                      off = disp + flat_file->indices[j] +
++                            (ADIO_Offset) n_filetypes * filetype_extent;
++                        new_fwr_size = flat_file->blocklens[j];
++                      if (size != bwr_size) {
++                          i += size;
++                          new_bwr_size -= size;
++                      }
++                  }
++                  if (size == bwr_size) {
++                      /* reached end of contiguous block in memory */
++                      k = (k + 1) % flat_buf->count;
++                      buf_count++;
++                      i = (int) (buftype_extent *
++                                  (buf_count / flat_buf->count) +
++                                flat_buf->indices[k]);
++                      new_bwr_size = flat_buf->blocklens[k];
++                      if (size != fwr_size) {
++                          off += size;
++                          new_fwr_size -= size;
++                      }
++                  }
++                  num += size;
++                  fwr_size = new_fwr_size;
++                  bwr_size = new_bwr_size;
++              }
++            }
++
++          /* write the buffer out finally */
++          if (writebuf_len) {
++              ADIO_WriteContig(fd, writebuf, writebuf_len,
++                               MPI_BYTE, ADIO_EXPLICIT_OFFSET,
++                               writebuf_off, &status1, error_code);
++              if (!(fd->atomicity))
++                  ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
++              if (*error_code != MPI_SUCCESS) {
++                    ADIOI_Free(writebuf);
++                  return;
++                }
++          }
++          if (fd->atomicity)
++              ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
++      }
++        ADIOI_Free(writebuf);
++      if (file_ptr_type == ADIO_INDIVIDUAL)
++          fd->fp_ind = off;
++    }
++    fd->fp_sys_posn = -1;     /* set it to null. */
++
++#ifdef HAVE_STATUS_SET_BYTES
++    MPIR_Status_set_bytes(status, datatype, bufsize);
++    /* This is a temporary way of filling in status. The right way is to
++    keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
++#endif
++
++    if (!buftype_is_contig)
++        ADIOI_Delete_flattened(datatype);
++}
+diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
+--- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
++++ ad_lustre/Makefile.in      2008-10-17 17:03:06.000000000 +0800
+@@ -16,7 +16,9 @@
+ @VPATH@
+ 
+ AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
+-      ad_lustre_rwcontig.o ad_lustre_hints.o 
++      ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o  \
++      ad_lustre_hints.o ad_lustre_aggregate.o
++
+ 
+ default: $(LIBNAME)
+       @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
+diff -ruN ad_lustre_orig/README ad_lustre/README
+--- ad_lustre_orig/README      2008-09-17 14:36:57.000000000 +0800
++++ ad_lustre/README   2008-10-17 16:50:15.000000000 +0800
+@@ -5,6 +5,23 @@
+   o To post the code for ParColl (Partitioned collective IO)
+  
+ -----------------------------------------------------
++V05: 
++-----------------------------------------------------
++Improved data redistribution
++  o Improve I/O pattern identification. Besides checking interleaving,
++    if request I/O size is small, collective I/O will be performed.
++    The hint big_req_size can be used to define the req size value.
++  o Provide hint CO for load balancing to control the number of
++    IO clients for each OST
++  o Produce stripe-contiguous I/O pattern that Lustre prefers
++  o Reduce the collective overhead by hints contiguous_data and
++    same_io_size to remove unnecessary MPI_Alltoall()
++  o Control read-modify-write in data sieving in collective IO
++    by hint ds_in_coll.
++  o Reduce extent lock conflicts by make each OST accessed by one or
++    more constant clients.
++
++-----------------------------------------------------
+ V04: 
+ -----------------------------------------------------
+   o Direct IO and Lockless IO support
+--- common/ad_write_coll_orig.c        2008-10-15 11:24:31.000000000 +0800
++++ common/ad_write_coll.c     2008-10-15 11:25:39.000000000 +0800
+@@ -42,7 +42,7 @@
+                            int *send_buf_idx, int *curr_to_proc, 
+                            int *done_to_proc, int iter, 
+                            MPI_Aint buftype_extent);
+-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
+                       ADIO_Offset *srt_off, int *srt_len, int *start_pos,
+                       int nprocs, int nprocs_recv, int total_elements);
+ 
+@@ -921,7 +921,7 @@
+ 
+ 
+ 
+-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
+                     ADIO_Offset *srt_off, int *srt_len, int *start_pos,
+                     int nprocs, int nprocs_recv, int total_elements)
+ {
diff --git a/lustre/doc/Makefile.am b/lustre/doc/Makefile.am

index 0d740e9..16fe376 100644 (file)
--- a/lustre/doc/Makefile.am
+++ b/lustre/doc/Makefile.am
@@ -1,7 +1,39 @@
-# Copyright (C) 2001, 2002 Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
+
  LYX2PDF = GS_OPTIONS=-dCompatibilityLevel=1.1 $(srcdir)/tex2pdf -overwrite
  TEX2PDF = GS_OPTIONS=-dCompatibilityLevel=1.1 $(srcdir)/tex2pdf -overwrite
  LYX2PS = lyx --export ps
@@ -15,7 +47,8 @@ TEXEXPAND = texexpand
  SUFFIXES = .lin .lyx .pdf .ps .sgml .html .txt .tex .fig .eps .dvi
  
  MANFILES = lustre.7 lfs.1 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 lctl.8 \
-       llverdev.8 llbackup.8 llapi_quotactl.3
+       llverdev.8 llbackup.8 llapi_quotactl.3 llobdstat.8 llstat.8 plot-llstat.8 \
+       l_getgroups.8 lst.8 routerstat.8 lshowmount.8 ll_recover_lost_found_objs.8
  if UTILS
  man_MANS = $(MANFILES)
  endif
diff --git a/lustre/doc/VERSIONING b/lustre/doc/VERSIONING

index a719103..1b4cc5c 100644 (file)
--- a/lustre/doc/VERSIONING
+++ b/lustre/doc/VERSIONING
@@ -65,7 +65,7 @@ New numbers are used as follows:
   - odd : when a new development cycle starts after a release
  3. patch:
   - when a development snapshot or release update becomes available
- - all these are announced on lustre-{announce,devel}@clusterfs.com
+ - all these are announced on lustre-{announce,devel}@lists.lustre.org
  4. test:
   - when developers feel it is time to exchange a named version
  
diff --git a/lustre/doc/l_getgroups.8 b/lustre/doc/l_getgroups.8

new file mode 100644 (file)

index 0000000..9219971
--- /dev/null
+++ b/lustre/doc/l_getgroups.8
@@ -0,0 +1,20 @@
+.TH l_getgroups 1 "Jul 7, 2008" Lustre "utilities"
+.SH NAME
+l_getgroups \- Handle Lustre user/group cache upcall
+.SH SYNOPSIS
+.B "l_getgroups [-v] [-d | mdsname] uid"
+.br
+.B "l_getgroups [-v] -s"
+.SH DESCRIPTION
+The group upcall file contains the path to an executable that, when
+properly installed, is invoked to resolve a numeric UID to a group
+membership list. This utility should complete the mds_grp_downcall_data
+data structure (see Data structures) and write it to the
+/proc/fs/lustre/mds/mds-service/group_info pseudo-file.
+.LP
+.B l_getgroups
+is the reference implementation of the user/group cache upcall
+.SH FILES
+/proc/fs/lustre/mds/mds-service/group_upcall
+.SH SEE ALSO
+Lustre Programming Interfaces section of Lustre Operations Manual.
diff --git a/lustre/doc/lctl.8 b/lustre/doc/lctl.8

index 9243863..088bd1f 100644 (file)
--- a/lustre/doc/lctl.8
+++ b/lustre/doc/lctl.8
@@ -174,14 +174,14 @@ lctl > quit
  # lctl conf_param testfs-MDT0000 sys.timeout=40
  
  .SH BUGS
-Please report all bugs to ClusterFileSystems, support@clusterfs.com
+Please report all bugs to Sun Microsystems, Inc. http://bugzilla.lustre.org/
  .SH AVAILABILITY
  .B lctl
  is part of the 
  .BR Lustre (7) 
-filesystem package and is available from CFS
+filesystem package and is available from Sun Microsystems, Inc.
  .br
-http://clusterfs.com
+http://www.sun.com/software/products/lustre/index.xml
  .SH SEE ALSO
  .BR Lustre (7),
  .BR mkfs.lustre (8),
diff --git a/lustre/doc/lfs.1 b/lustre/doc/lfs.1

index d00fcac..8eb2a8e 100644 (file)
--- a/lustre/doc/lfs.1
+++ b/lustre/doc/lfs.1
@@ -17,18 +17,29 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the
          \fB[[!] --uid|-u N] [[!] --user|-U <name>]
          \fB<dirname|filename>\fR
  .br
+.B lfs osts
+.br
  .B lfs getstripe [--obd|-O <uuid>] [--quiet|-q] [--verbose|-v] 
-              \fB[--recursive|-r] <dirname|filename>\fR
+        \fB[--recursive|-r] <dirname|filename>\fR
  .br
  .B lfs setstripe [--size|-s stripe-size] [--count|-c stripe-cnt]
-              \fB[--index|-i start-ost] <filename|dirname>\fR
+        \fB[--offset|-o start-ost] [--pool|-p pool-name]
+        \fB<dir|filename>\fR
  .br
-.B lfs setstripe -d <dirname>
+.B lfs setstripe -d <dir>
  .br
-.B lfs quotachown [-i] <filesystem>
+.B lfs poollist <filesystem>[.<pool>] | <pathname>
+.br
+.B lfs quota [-v] [-o obd_uuid] [-u|-g] <username|groupname> <filesystem>
+.br
+.B lfs quota <filesystem>
+.br
+.B lfs quota -t [-u|-g] <filesystem>
  .br
  .B lfs quotacheck [-ug] <filesystem>
  .br
+.B lfs quotachown [-i] <filesystem>
+.br
  .B lfs quotaon [-ugf] <filesystem>
  .br
  .B lfs quotaoff [-ug] <filesystem>
@@ -56,15 +67,10 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the
               \fB[-b <block-grace>] [-i <inode-grace>]
               \fB<filesystem>\fR
  .br
-
-.B lfs quota [-o obd_uuid] [-u|-g] <username|groupname> <filesystem>
-.br
-.B lfs quota -t [-u|-g] <filesystem>
-.br
  .B lfs help
  .SH DESCRIPTION
  .B lfs
-can be used to create a new file with a specific striping pattern, determine the default striping pattern, gather the extended attributes (object numbers and location) for a specific file. It can be invoked interactively without any arguments or in a non-interactive mode with one of the arguements supported. 
+can be used to create a new file with a specific striping pattern, determine the default striping pattern, gather the extended attributes (object numbers and location) for a specific file, find files with specific attributes, list OST information, or set quota limits. It can be invoked interactively without any arguments or in a non-interactive mode with one of the arguements supported. 
  .SH OPTIONS
  The various options supported by lctl are listed and explained below:
  .TP
@@ -75,15 +81,17 @@ Display the status of MDS or OSTs (as specified in the command) or all the serve
  Report filesystem disk space usage or inodes usage of each MDT/OST.
  .TP
  .B find 
-To search the directory tree rooted at the given dir/file name for the files that match the given parameters: \fB--atime\fR (file was last accessed N*24 hours ago), \fB--ctime\fR (file's status was last changed N*24 hours ago), \fB--mtime\fR (file's data was last modified N*24 hours ago), \fB--obd\fR (file has an object on a specific OST or OSTs), \fB--size\fR (file has size in bytes, or \fBk\fRilo-, \fBM\fRega-, \fBG\fRiga-, \fBT\fRera-, \fBP\fReta-, or \fBE\fRxabytes if a suffix is given), \fB--type\fR (file has the type: \fBb\fRlock, \fBc\fRharacter, \fBd\fRirectory, \fBp\fRipe, \fBf\fRile, sym\fBl\fRink, \fBs\fRocket, or \fBD\fRoor (Solaris)), \fB--uid\fR (file has specific numeric user ID), \fB--user\fR (file owned by specific user, numeric user ID allowed), \fB--gid\fR (file has specific group ID), \fB--group\fR (file belongs to specific group, numeric group ID allowed). The option \fB--maxdepth\fR allows find to decend at most N levels of directory tree. The options \fB--print\fR and \fB--print0\fR print full file name, followed by a newline or NUL character correspondingly.  Using \fB!\fR before an option negates its meaning (\fIfiles NOT matching the parameter\fR).  Using \fB+\fR before a numeric value means \fIfiles with the parameter OR MORE\fR, while \fB-\fR before a numeric value means \fIfiles with the parameter OR LESS\fR.
-.TP
-.B getstripe
-To list the striping info for a given filename or files in a directory, optionally recursively, for all files in a directory tree: \fB--quiet\fR (don't print object IDs), \fB--verbose\fR (print striping parameters), \fB--recursive\fR (recurse into subdirectories).
+To search the directory tree rooted at the given dir/file name for the files that match the given parameters: \fB--atime\fR (file was last accessed N*24 hours ago), \fB--ctime\fR (file's status was last changed N*24 hours ago), \fB--mtime\fR (file's data was last modified N*24 hours ago), \fB--obd\fR (file has an object on a specific OST or OSTs), \fB--size\fR (file has size in bytes, or \fBk\fRilo-, \fBM\fRega-, \fBG\fRiga-, \fBT\fRera-, \fBP\fReta-, or \fBE\fRxabytes if a suffix is given), \fB--type\fR (file has the type: \fBb\fRlock, \fBc\fRharacter, \fBd\fRirectory, \fBp\fRipe, \fBf\fRile, sym\fBl\fRink, \fBs\fRocket, or \fBD\fRoor (Solaris)), \fB--uid\fR (file has specific numeric user ID), \fB--user\fR (file owned by specific user, numeric user ID allowed), \fB--gid\fR (file has specific group ID), \fB--group\fR (file belongs to specific group, numeric group ID allowed). The option \fB--maxdepth\fR limits find to decend at most N levels of directory tree. The options \fB--print\fR and \fB--print0\fR print full file name, followed by a newline or NUL character correspondingly.  Using \fB!\fR before an option negates its meaning (\fIfiles NOT matching the parameter\fR).  Using \fB+\fR before a numeric value means \fIfiles with the parameter OR MORE\fR, while \fB-\fR before a numeric value means \fIfiles with the parameter OR LESS\fR.
  .TP
  .B osts 
  List all the OSTs for the filesystem
  .TP
-.B setstripe [--size stripe-size] [--count stripe-cnt] [--index start-ost]
+.B getstripe
+To list the striping info for a given filename or files in a directory, optionally recursively, for all files in a directory tree: \fB--quiet\fR (don't print object IDs), \fB--verbose\fR (print striping parameters), \fB--recursive\fR (recurse into subdirectories).
+.TP
+.B setstripe [--size stripe-size] [--count stripe-cnt] 
+       \fB[--offset start-ost] [--pool pool-name]\fR
+.br
  To create a new file, or set the directory default, with the specified striping parameters.  The
  .I stripe-count
  is the number of OSTs to stripe a file over. A
@@ -96,15 +104,24 @@ is the number of bytes to store on each OST before moving to the next OST.  A
  .I stripe-size
  of 0 means to use the filesystem-wide default stripe size (default 1MB).  The
  .I start-ost
-is the OST index (starting at 0) on which to start striping for this file.  A
+is the OST index (base 10, starting at 0) on which to start striping for this file.  A
  .I start-ost
-of -1 allows the MDS to specify the starting index and it is strongly
-recommended that the starting OST not be given, as this allows space and
-load balancing to be done by the MDS as needed.
+of -1 allows the MDS to choose the starting index and it is strongly recommended, as this allows space and load balancing to be done by the MDS as needed.  The
+.I pool-name
+is the name of a predefined pool of OSTs (see 
+.I lctl
+) that will be used for striping. The 
+.I stripe-count, stripe-size, start-ost
+will be used as well; the 
+.I start-ost
+must be part of the pool or an error will be returned. 
  .TP
-.B lfs setstripe -d
+.B setstripe -d
  Delete the default striping on the specified directory.
  .TP
+.B poollist <filesystem>[.<pool>] | <pathname>
+List the pools in \fBfilesystem\fR or \fBpathname\fR, or the OSTs in \fBfilesystem.pool\fR
+.TP
  .B quotachown
  To change files' owner and group on OSTs of the specified filesystem
  .TP
@@ -126,8 +143,8 @@ To set filesystem quotas for users or groups. Limits can be specified with -b, -
  .B setquota -t [-u|-g] [--block-grace <block-grace>] [--inode-grace <inode-grace>] <filesystem>
  To set filesystem quota grace times for users or groups. Grace time is specified in "XXwXXdXXhXXmXXs" format or as an integer seconds value, see EXAMPLES
  .TP
-.B quota [-o obd_uuid] [-u|-g] <username|groupname> <filesystem>
-To display disk usage and limits, either for the full filesystem, or for objects on a specific obd. A user or group name must be specified.
+.B quota [-v] [-o obd_uuid] [-u|-g] <username|groupname> <filesystem>
+To display disk usage and limits, either for the full filesystem, or for objects on a specific obd. A user or group name can be specified. If both user and group are omitted quotas for current uid/gid are shown. -v provides more verbose (with per-obd statistics) output.
  .TP
  .B quota -t [-u|-g] <filesystem>
  To display block and inode grace times for user (-u) or group (-g) quotas
@@ -196,7 +213,7 @@ Show grace times for user quotas on /mnt/lustre
  The \fBlfs find\fR command isn't as comprehensive as \fBfind\fR(1).
  Report bugs using http://bugzilla.lustre.org.
  .SH AUTHOR
-The lfs command is part of the Lustre filesystem.  Contact info@clusterfs.com.
+The lfs command is part of the Lustre filesystem.  Contact http://www.lustre.org/
  .SH SEE ALSO
  .BR lctl (8),
  .BR lustre (7)
diff --git a/lustre/doc/ll_recover_lost_found_objs.8 b/lustre/doc/ll_recover_lost_found_objs.8

new file mode 100644 (file)

index 0000000..717c2ef
--- /dev/null
+++ b/lustre/doc/ll_recover_lost_found_objs.8
@@ -0,0 +1,25 @@
+.TH ll_recover_lost_found_objs 1 "Aug 21, 2008" Lustre "utilities"
+.SH NAME
+ll_recover_lost_found_objs \- recover Lustre OST objects in lost+found
+.SH SYNOPSIS
+.B "ll_recover_lost_found_objs [-hv] -d directory"
+.br
+.SH DESCRIPTION
+.B ll_recover_lost_found_objs
+recovers objects from lost+found that might result from a
+Lustre OST with a corrupted directory. Running e2fsck will fix the
+directory, but puts all of the objects into lost+found, where they are
+inaccessible to Lustre.
+.TP
+.I "\-h"
+Print help message.
+.TP
+.I "\-v"
+Increase verbosity.
+.TP
+.I "\-d directory"
+Set lost+found directory path.
+.SH EXAMPLE
+.fi
+ll_recover_lost_found_objs -d /mnt/ost/lost+found
+.fi
diff --git a/lustre/doc/llobdstat.8 b/lustre/doc/llobdstat.8

new file mode 100644 (file)

index 0000000..604cf8a
--- /dev/null
+++ b/lustre/doc/llobdstat.8
@@ -0,0 +1,35 @@
+.TH llobdstat 1 "Jul 7, 2008" Lustre "utilities"
+.SH NAME
+llobdstat \- display OST statistics
+.SH SYNOPSIS
+.B "llobdstat ost_name [interval]"
+.br
+.SH DESCRIPTION
+.B llobdstat
+displays a line of OST statistics for the given
+.I ost_name
+every
+.I interval
+seconds.  It should be run directly on an OSS node.
+Type control-C to stop statistics printing.
+.SH EXAMPLE
+.nf
+# llobdstat liane-OST0002 1
+/usr/bin/llobdstat on /proc/fs/lustre/obdfilter/liane-OST0002/stats
+Processor counters run at 2800.189 MHz
+Read: 1.21431e+07, Write: 9.93363e+08, create/destroy: 24/1499, stat: 34, punch: 18
+[NOTE: cx: create, dx: destroy, st: statfs, pu: punch ]
+
+Timestamp   Read-delta  ReadRate  Write-delta  WriteRate
+--------------------------------------------------------
+1217026053    0.00MB    0.00MB/s     0.00MB    0.00MB/s
+1217026054    0.00MB    0.00MB/s     0.00MB    0.00MB/s
+1217026055    0.00MB    0.00MB/s     0.00MB    0.00MB/s
+1217026056    0.00MB    0.00MB/s     0.00MB    0.00MB/s
+1217026057    0.00MB    0.00MB/s     0.00MB    0.00MB/s
+1217026058    0.00MB    0.00MB/s     0.00MB    0.00MB/s
+1217026059    0.00MB    0.00MB/s     0.00MB    0.00MB/s st:1
+...
+.fi
+.SH FILES
+/proc/fs/lustre/obdfilter/<ostname>/stats.
diff --git a/lustre/doc/llstat.8 b/lustre/doc/llstat.8

new file mode 100644 (file)

index 0000000..cbe96e0
--- /dev/null
+++ b/lustre/doc/llstat.8
@@ -0,0 +1,45 @@
+.TH llstat 1 "Jul 7, 2008" Lustre "utilities"
+.SH NAME
+llstat \- print Lustre statistics
+.SH SYNOPSIS
+.B "llstat [-c] [-g] [-i interval] stats_file"
+.br
+.SH DESCRIPTION
+.B llstat
+can display statistics from any of several lustre stats files that
+share a common format, updated every \fIinterval\fR seconds.
+Use control-C to stop statistics printing.
+.TP
+.I "\-c"
+Clear the stats file first.
+.TP
+.I "\-i interval"
+Polling period in seconds.
+.TP
+.I "\-g"
+Graphable output format.
+.TP
+.I "\-h"
+Display help information.
+.TP
+.I "stats_file"
+Either the full path to a stats file, or the shorthand:
+\fImds\fR or \fIost\fR.
+.SH EXAMPLE
+To monitor /proc/fs/lustre/ost/OSS/ost/stats every second:
+.IP
+llstat -i 1 ost
+.SH FILES
+.nf
+/proc/fs/lustre/mdt/MDS/*/stats
+/proc/fs/lustre/mds/*/exports/*/stats
+/proc/fs/lustre/mdc/*/stats
+/proc/fs/lustre/ldlm/services/*/stats
+/proc/fs/lustre/ldlm/namespaces/*/pool/stats
+/proc/fs/lustre/mgs/MGS/exports/*/stats
+/proc/fs/lustre/ost/OSS/*/stats
+/proc/fs/lustre/osc/*/stats
+/proc/fs/lustre/obdfilter/*/exports/*/stats
+/proc/fs/lustre/obdfilter/*/stats
+/proc/fs/lustre/llite/*/stats
+.fi
diff --git a/lustre/doc/llverdev.8 b/lustre/doc/llverdev.8

index fb41540..8785054 100644 (file)
--- a/lustre/doc/llverdev.8
+++ b/lustre/doc/llverdev.8
@@ -1,5 +1,4 @@
  .\" -*- nroff -*-
-.\" Copyright 2006 by Cluster FileSystems.  All Rights Reserved.
  .\" Copyright 2008 by Sun Microsystems.  All Rights Reserved.
  .\" This file may be copied under the terms of the GNU Public License, v2.
  .\"
diff --git a/lustre/doc/lshowmount.8 b/lustre/doc/lshowmount.8

new file mode 100644 (file)

index 0000000..a2ad08e
--- /dev/null
+++ b/lustre/doc/lshowmount.8
@@ -0,0 +1,43 @@
+.TH LSHOWMOUNT 8 Lustre LLNL LSHOWMOUNT
+.SH NAME
+lshowmount \- show lustre exports
+.SH SYNOPSIS
+.B "lshowmount [-ehlv]"
+.br
+.SH DESCRIPTION
+.B lshowmount
+Utility to show the hosts that have lustre currently mounted to a server.
+Ths utility looks for any exports from the mgs, mds, and obdfilter.
+.SH OPTIONS
+.B lshowmount
+accepts the following options:
+.TP
+.I "-e | --enumerate"
+causes
+.B lshowmount
+to list each client mounted on a separate line instead of trying
+to compress the list of clients into a hostrange string.
+.TP
+.I "-h | --help"
+causes
+.B lshowmount
+to print out a usage message.
+.TP
+.I "-l | --lookup"
+causes
+.B lshowmount
+to try to lookup the hostname for nids that look like IP addresses.
+.TP
+.I "-v | --verbose"
+causes
+.B lshowmount
+to output export information for each service instead of only displaying
+the aggregate information for all Lustre services on the server.
+.SH FILES
+/proc/fs/lustre/mgs/<server>/exports/<uuid>/nid
+.br
+/proc/fs/lustre/mds/<server>/exports/<uuid>/nid
+.br
+/proc/fs/lustre/obdfilter/<server>/exports/<uuid>/nid
+.SH AUTHOR
+Herb Wartens
diff --git a/lustre/doc/lst.8 b/lustre/doc/lst.8

new file mode 100644 (file)

index 0000000..a34bcd9
--- /dev/null
+++ b/lustre/doc/lst.8
@@ -0,0 +1,88 @@
+.TH lst 1 "Jul 7, 2008" Lustre "utilities"
+.SH NAME
+lst \- Start the Lustre LNET Self-test
+.SH SYNOPSIS
+.B "lst"
+.SH DESCRIPTION
+LNET self-test helps site administrators confirm that Lustre
+Networking (LNET) has been properly installed and configured.
+The self-test also confirms that LNET and the network software
+and hardware underlying it are performing according to expectations.
+.LP
+Every LNET self-test runs in the context of a session.  A node can be
+associated with only one session at a time to ensure the session has
+exclusive use of the nodes on which it is running. A session is created,
+controlled and monitored from a single node; this is referred to as the
+self-test console.
+.LP
+Any node may act as the self-test console.  Nodes are named and allocated
+to a self-test session in groups.  This allows all nodes in a group to
+be referenced by a single name.
+.LP
+Test configurations are built by describing and running test batches.
+A test batch is simply a named collection of tests, with each test
+composed of a number of individual point-to-point tests running in parallel.
+These individual point-to-point tests are instantiated according to the
+test type, source group, target group and distribution specified when the
+test is added to the test batch.
+.LP
+.SH MODULES
+To run LNET self-test, load these modules: libcfs, lnet, lnet_selftest
+and any one of the klnds (i.e, ksocklnd, ko2iblnd...).
+To load all necessary modules, run modprobe lnet_selftest, which
+recursively loads the modules that lnet_selftest depends on.
+.LP
+There are two types of nodes for LNET self-test: the console node and
+test nodes. Both node types require all previously-specified modules
+to be loaded. (The userspace test node does not require these modules.)
+.LP
+Test nodes can be in either kernel or userspace.  A console user can
+invite a kernel test node to join the test session by running
+.I "lst add_group NID",
+but the user cannot actively add a userspace test node to the
+test-session.  However, the console user can passively accept a test
+node to the test session while the test node runs lstclient to connect
+to the console.
+.SH UTILITIES
+LNET self-test has two user utilities, lst and lstclient.
+.LP
+.B lst
+is the user interface for the self-test console (run on console node).
+It provides a list of commands to control the entire test system,
+such as create session, create test groups, etc.
+.LP
+.B lstclient
+is the userspace self-test program, which is linked with userspace
+LNDs and LNET. A user can use lstclient to join a self-test session:
+.nf
+lstclient -sesid CONSOLE_NID group NAME
+.fi
+.SH EXAMPLE SCRIPT
+Below is a sample LNET self-test script which simulates the traffic
+pattern of a set of Lustre servers on a TCP network, accessed by Lustre
+clients on an IB network (connected via LNET routers), with half the
+clients reading and half the clients writing.
+.LP
+.nf
+#!/bin/bash
+export LST_SESSION=$$
+lst new_session read/write
+lst add_group servers 192.168.10.[8,10,12-16]@tcp
+lst add_group readers 192.168.1.[1-253/2]@o2ib
+lst add_group writers 192.168.1.[2-254/2]@o2ib
+lst add_batch bulk_rw
+lst add_test --batch bulk_rw --from readers --to servers \
+    brw read check=simple size=1M
+lst add_test --batch bulk_rw --from writers --to servers \
+    brw write check=full size=4K
+# start running
+lst run bulk_rw
+# display server stats for 30 seconds
+lst stat servers & sleep 30; kill $?
+# tear down
+lst end_session
+.fi
+.SH SEE ALSO
+This manual page was extracted from Introduction to LNET Self-Test,
+section 19.4.1 of the Lustre Operations Manual.  For more detailed
+information, please refer to that document.
diff --git a/lustre/doc/lustre.7 b/lustre/doc/lustre.7

index 682246f..efa92bd 100644 (file)
--- a/lustre/doc/lustre.7
+++ b/lustre/doc/lustre.7
@@ -1,5 +1,5 @@
  .\" -*- nroff -*-
-.\" Copyright 2006 by Cluster FileSystems.  All Rights Reserved.
+.\" Copyright 2008 by Sun Microsystems, Inc.  All Rights Reserved.
  .\" This file may be copied under the terms of the GNU Public License.
  .\"
  .TH Lustre 7 "2006 Jun 15" Lustre "A high-performance cluster file system"
diff --git a/lustre/doc/mkfs.lustre.8 b/lustre/doc/mkfs.lustre.8

index 43b9876..ab14db5 100644 (file)
--- a/lustre/doc/mkfs.lustre.8
+++ b/lustre/doc/mkfs.lustre.8
@@ -1,5 +1,4 @@
  .\" -*- nroff -*-
-.\" Copyright 2006 by Cluster File Systems, Inc.  All Rights Reserved.
  .\" Copyright 2008 by Sun Microsystems.  All Rights Reserved.
  .\" This file may be copied under the terms of the GNU Public License, v2.
  .\"
diff --git a/lustre/doc/mount.lustre.8 b/lustre/doc/mount.lustre.8

index 4980db3..40085b3 100644 (file)
--- a/lustre/doc/mount.lustre.8
+++ b/lustre/doc/mount.lustre.8
@@ -1,5 +1,4 @@
  .\" -*- nroff -*-
-.\" Copyright 2006 by Cluster FileSystems.  All Rights Reserved.
  .\" Copyright 2008 by Sun Microsystems.  All Rights Reserved.
  .\" This file may be copied under the terms of the GNU Public License v2.
  .\"
@@ -117,6 +116,18 @@ Start the Lustre metadata target service from /dev/sda1 on mountpoint /mnt/test/
  .B mount -t lustre -L testfs-MDT0000 -o abort_recov /mnt/test/mdt
  Start the testfs-MDT0000 service (by using the disk label), but abort the
  recovery process.
+.SH NOTES
+If the Service Tags tool (from the sun-servicetag package) can be found in
+/opt/sun/servicetag/bin/stclient an inventory service tag will be created
+reflecting the Lustre service being provided.  If this tool cannot be found
+.B mount.lustre
+will silently ignore it and no service tag is created.  The
+.BR stclient (1)
+tool only creates the local service tag.  No information is sent to the asset
+management system until you run the Registration Client to collect the tags
+and then upload them to the inventory system using your inventory system account.
+See https://inventory.sun.com/ for more details on a web-based, free, IT asset
+management system.
  .SH BUGS
  Not very many mount options can be changed with
  .BR "-o remount" .
diff --git a/lustre/doc/plot-llstat.8 b/lustre/doc/plot-llstat.8

new file mode 100644 (file)

index 0000000..9abb522
--- /dev/null
+++ b/lustre/doc/plot-llstat.8
@@ -0,0 +1,43 @@
+.TH plot0llstat 1 "Jul 7, 2008" Lustre "utilities"
+.SH NAME
+plot-llstat \- plot Lustre statistics
+.SH SYNOPSIS
+.B "plot-llstat results_filename [parameter_index]"
+.br
+.SH DESCRIPTION
+The
+.B plot-llstat
+script is used to generate csv file and instructions files for gnuplot
+from the output of
+.B llstat.
+Since
+.B llstat
+is generic in nature,
+.B plot-llstat
+is also a generic script.
+.LP
+.I parameter_index
+can be 1 for count per interval, 2 for count per second (default),
+or 3 for total count.
+.LP
+plot-llstat script creates dat(csv) file using number of operations
+specified by the user. Number of operations equals to number of columns in csv
+file. And values in those columns are equals to the corresponding value of
+the
+.I "parameter_index"
+parameter from the output file.
+.LP
+.B plot-llstat
+also creates .scr file that contains instructions for gnuplot to plot
+the graph. After generating .dat and .scr files
+.B plot-llstat
+invokes
+.B gnuplot
+to display graph.
+.SH EXAMPLE
+.nf
+llstat -i2 -g -c lustre-OST0000 > log
+plot-llstat log 3
+.fi
+.SH SEE ALSO
+llstat(8)
diff --git a/lustre/doc/postbar b/lustre/doc/postbar

index 349d41c..f1111eb 100755 (executable)
--- a/lustre/doc/postbar
+++ b/lustre/doc/postbar
@@ -1,6 +1,8 @@
  #! /usr/bin/perl
  # postbar - Massage chbar.sh output into valid LaTeX
-# Copyright (C) 2002  Cluster File Systems, Inc.
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
  # Gord Eagle <gord@clusterfs.com>, 2002-08-10
  
  my $progname = $0;
diff --git a/lustre/doc/routerstat.8 b/lustre/doc/routerstat.8

new file mode 100644 (file)

index 0000000..159b171
--- /dev/null
+++ b/lustre/doc/routerstat.8
@@ -0,0 +1,27 @@
+.TH routerstat 1 "Jul 7, 2008" Lustre "utilities"
+.SH NAME
+routerstat \- print Lustre router statistics
+.SH SYNOPSIS
+.B "routerstat [interval]
+.br
+.SH DESCRIPTION
+.B routerstat
+watches LNET router statistics.  If no
+.I interval
+is specified, stats are sampled and printed only once;
+otherwise, stats are sampled and printed every
+.I interval
+seconds.
+.LP
+Output includes the following fields:
+.LP
+.nf
+M - msgs_alloc(msgs_max)
+E - errors
+S - send_count/send_length
+R - recv_count/recv_length
+F - route_count/route_length
+D - drop_count/drop_length
+.fi
+.SH FILES
+/proc/sys/lnet/stats
diff --git a/lustre/doc/tunefs.lustre.8 b/lustre/doc/tunefs.lustre.8

index e99e15a..ec1c46b 100644 (file)
--- a/lustre/doc/tunefs.lustre.8
+++ b/lustre/doc/tunefs.lustre.8
@@ -1,5 +1,4 @@
  .\" -*- nroff -*-
-.\" Copyright 2006 by Cluster File Systems, Inc.  All Rights Reserved.
  .\" Copyright 2008 by Sun Microsystems, Inc.  All Rights Reserved.
  .\" This file may be copied under the terms of the GNU Public License, v2.
  .\"
diff --git a/lustre/include/Makefile.am b/lustre/include/Makefile.am

index c641829..3c60c26 100644 (file)
--- a/lustre/include/Makefile.am
+++ b/lustre/include/Makefile.am
@@ -1,18 +1,46 @@
+#
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
  
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  SUBDIRS = linux lustre
  
  EXTRA_DIST = ioctl.h liblustre.h lprocfs_status.h lustre_cfg.h         \
-            lustre_commit_confd.h lustre_debug.h lustre_disk.h \
-            lustre_dlm.h lustre_export.h lustre_fsfilt.h lustre_ha.h \
-            lustre_handles.h lustre_import.h lustre_lib.h class_hash.h \
-            lustre_lite.h lustre_log.h lustre_mds.h lustre_net.h \
-             lustre_param.h lustre_quota.h lustre_ucache.h lvfs.h \
-            obd_cache.h obd_class.h obd_echo.h obd.h obd_lov.h \
-            obd_ost.h obd_support.h lustre_cache.h lustre_ver.h \
-            interval_tree.h
-
+            lustre_debug.h lustre_disk.h lustre_dlm.h lustre_export.h \
+            lustre_fsfilt.h lustre_ha.h lustre_handles.h lustre_import.h \
+            lustre_lib.h class_hash.h lustre_lite.h lustre_log.h \
+            lustre_mds.h lustre_net.h lustre_param.h lustre_quota.h \
+            lustre_ucache.h lvfs.h obd_cache.h obd_class.h obd_echo.h \
+            obd.h obd_lov.h obd_ost.h obd_support.h lustre_cache.h \
+             lustre_ver.h interval_tree.h
diff --git a/lustre/include/class_hash.h b/lustre/include/class_hash.h

index f27a046..e2b2b11 100644 (file)
--- a/lustre/include/class_hash.h
+++ b/lustre/include/class_hash.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __CLASS_HASH_H
@@ -7,140 +39,302 @@
  
  #include <lustre_lib.h>
  
-/* #define LUSTRE_HASH_DEBUG 1 */
-
-/* define the hash bucket*/
-struct lustre_hash_bucket { 
-        struct hlist_head lhb_head;
-        spinlock_t lhb_lock;
-#ifdef LUSTRE_HASH_DEBUG
-        /* the number of hash item per bucket, 
-         * it will help us to analyse the hash distribute 
-         */
-        int lhb_item_count; 
-#endif
-};
-
-struct lustre_hash_operations;
-
-struct lustre_class_hash_body {
-        char hashname[128];
-        spinlock_t lchb_lock; /* body lock */
-        struct lustre_hash_bucket *lchb_hash_tables;
-        __u32 lchb_hash_max_size; /* define the hash tables size */
-        /* define the hash operations */
-        struct lustre_hash_operations *lchb_hash_operations;
-};
-
-/* hash operations method define */
-struct lustre_hash_operations {
-        __u32 (*lustre_hashfn) (struct lustre_class_hash_body *hash_body, 
-                                void *key);
-        int   (*lustre_hash_key_compare) (void *key, 
-                                          struct hlist_node *compared_hnode);
-        /* add refcount */ 
-        void* (*lustre_hash_object_refcount_get) (struct hlist_node *hash_item);
-        /* dec refcount */
-        void  (*lustre_hash_object_refcount_put) (struct hlist_node *hash_item);
-};
-
-static inline struct hlist_node * 
-lustre_hash_getitem_in_bucket_nolock(struct lustre_class_hash_body *hash_body, 
-                                     int hashent, void *key)
-{
-        struct lustre_hash_bucket *bucket;
-        struct hlist_node  *hash_item_node;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
-        int find = 0;
-        ENTRY;
-
-        bucket = &hash_body->lchb_hash_tables[hashent];
-        hlist_for_each(hash_item_node, &(bucket->lhb_head)) {
-                find = hop->lustre_hash_key_compare(key, hash_item_node);
-                if (find == 1)
-                        break;
+struct lustre_hash_ops;
+
+typedef struct lustre_hash_bucket {
+        struct hlist_head           lhb_head;       /* entries list */
+        atomic_t                    lhb_count;      /* current entries */
+        rwlock_t                    lhb_rwlock;     /* lustre_hash_bucket */
+} lustre_hash_bucket_t;
+
+#define LUSTRE_MAX_HASH_NAME 16
+
+typedef struct lustre_hash {
+        int                         lh_cur_bits;    /* current hash bits */
+        int                         lh_cur_mask;    /* current hash mask */
+        int                         lh_min_bits;    /* min hash bits */
+        int                         lh_max_bits;    /* max hash bits */
+        int                         lh_min_theta;   /* resize min threshold */
+        int                         lh_max_theta;   /* resize max threshold */
+        int                         lh_flags;       /* hash flags */
+        atomic_t                    lh_count;       /* current entries */
+        atomic_t                    lh_rehash_count;/* resize count */
+        struct lustre_hash_bucket  *lh_buckets;     /* hash buckets */
+        struct lustre_hash_ops     *lh_ops;         /* hash operations */
+        rwlock_t                    lh_rwlock;      /* lustre_hash */
+        char                        lh_name[LUSTRE_MAX_HASH_NAME];
+} lustre_hash_t;
+
+typedef struct lustre_hash_ops {
+        unsigned (*lh_hash)(lustre_hash_t *lh, void *key, unsigned mask);
+        void *   (*lh_key)(struct hlist_node *hnode);
+        int      (*lh_compare)(void *key, struct hlist_node *hnode);
+        void *   (*lh_get)(struct hlist_node *hnode);
+        void *   (*lh_put)(struct hlist_node *hnode);
+        void     (*lh_exit)(struct hlist_node *hnode);
+} lustre_hash_ops_t;
+
+#define LH_DEBUG        0x0001          /* Enable expensive debug checks */
+#define LH_REHASH       0x0002          /* Enable dynamic hash resizing */
+
+#define LHO(lh)         (lh)->lh_ops
+#define LHP(lh, op)     (lh)->lh_ops->lh_ ## op
+
+static inline unsigned
+lh_hash(lustre_hash_t *lh, void *key, unsigned mask)
+{
+        LASSERT(lh);
+        LASSERT(LHO(lh));
+
+        if (LHP(lh, hash))
+                return LHP(lh, hash)(lh, key, mask);
+
+        return -EOPNOTSUPP;
+}
+
+static inline void *
+lh_key(lustre_hash_t *lh, struct hlist_node *hnode)
+{
+        LASSERT(lh);
+        LASSERT(hnode);
+        LASSERT(LHO(lh));
+
+        if (LHP(lh, key))
+                return LHP(lh, key)(hnode);
+
+        return NULL;
+}
+
+/* Returns 1 on a match,
+ * XXX: This would be better if it returned, -1, 0, or 1 for
+ *      <, =, > respectivly.  It could then be used to implement
+ *      a LH_SORT feature flags which could keep each lustre hash
+ *      bucket in order.  This would increase insertion times
+ *      but could reduce lookup times for deep chains.  Ideally,
+ *      the rehash should keep chain depth short but if that
+ *      ends up not being the case this would be a nice feature.
+ */
+static inline int
+lh_compare(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
+{
+        LASSERT(lh);
+        LASSERT(hnode);
+        LASSERT(LHO(lh));
+
+        if (LHP(lh, compare))
+                return LHP(lh, compare)(key, hnode);
+
+        return -EOPNOTSUPP;
+}
+
+static inline void *
+lh_get(lustre_hash_t *lh, struct hlist_node *hnode)
+{
+        LASSERT(lh);
+        LASSERT(hnode);
+        LASSERT(LHO(lh));
+
+        if (LHP(lh, get))
+                return LHP(lh, get)(hnode);
+
+        return NULL;
+}
+
+static inline void *
+lh_put(lustre_hash_t *lh, struct hlist_node *hnode)
+{
+        LASSERT(lh);
+        LASSERT(hnode);
+        LASSERT(LHO(lh));
+
+        if (LHP(lh, put))
+                return LHP(lh, put)(hnode);
+
+        return NULL;
+}
+
+static inline void
+lh_exit(lustre_hash_t *lh, struct hlist_node *hnode)
+{
+        LASSERT(lh);
+        LASSERT(hnode);
+        LASSERT(LHO(lh));
+
+        if (LHP(lh, exit))
+                return LHP(lh, exit)(hnode);
+}
+
+/* Validate hnode references the correct key */
+static inline void
+__lustre_hash_key_validate(lustre_hash_t *lh, void *key,
+                           struct hlist_node *hnode)
+{
+        if (unlikely(lh->lh_flags & LH_DEBUG))
+                LASSERT(lh_compare(lh, key, hnode));
+}
+
+/* Validate hnode is in the correct bucket */
+static inline void
+__lustre_hash_bucket_validate(lustre_hash_t *lh, lustre_hash_bucket_t *lhb,
+                              struct hlist_node *hnode)
+{
+        unsigned i;
+
+        if (unlikely(lh->lh_flags & LH_DEBUG)) {
+                i = lh_hash(lh, lh_key(lh, hnode), lh->lh_cur_mask);
+                LASSERT(&lh->lh_buckets[i] == lhb);
          }
-        RETURN(find == 1 ? hash_item_node : NULL);
-}
-
-static inline int 
-lustre_hash_delitem_nolock(struct lustre_class_hash_body *hash_body, 
-                           int hashent, struct hlist_node * hash_item)
-{
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
-
-        hlist_del_init(hash_item);
-
-        hop->lustre_hash_object_refcount_put(hash_item);
-
-#ifdef LUSTRE_HASH_DEBUG
-        hash_body->lchb_hash_tables[hashent].lhb_item_count--;
-        CDEBUG(D_INFO, "hashname[%s] bucket[%d] has [%d] hashitem\n", 
-                        hash_body->hashname, hashent, 
-                        hash_body->lchb_hash_tables[hashent].lhb_item_count);
-#endif
-
-        RETURN(0);
-}
-
-typedef void (*hash_item_iterate_cb) (void *obj, void *data);
-
-int lustre_hash_init(struct lustre_class_hash_body **hash_body,
-                     char *hashname, __u32 hashsize, 
-                     struct lustre_hash_operations *hash_operations);
-void lustre_hash_exit(struct lustre_class_hash_body **hash_body);
-int lustre_hash_additem_unique(struct lustre_class_hash_body *hash_body, 
-                               void *key, struct hlist_node *actual_hnode);
-void *lustre_hash_findadd_unique(struct lustre_class_hash_body *hash_body,
-                                 void *key, struct hlist_node *actual_hnode);
-int lustre_hash_additem(struct lustre_class_hash_body *hash_body, void *key, 
-                        struct hlist_node *actual_hnode);
-int lustre_hash_delitem_by_key(struct lustre_class_hash_body *hash_body, 
-                               void *key);
-int lustre_hash_delitem(struct lustre_class_hash_body *hash_body, void *key, 
-                        struct hlist_node *hash_item);
-void lustre_hash_bucket_iterate(struct lustre_class_hash_body *hash_body,
-                                void *key, hash_item_iterate_cb,
-                                void *data);
-void lustre_hash_iterate_all(struct lustre_class_hash_body *hash_body,
-                             hash_item_iterate_cb, void *data);
-
-void * lustre_hash_get_object_by_key(struct lustre_class_hash_body *hash_body,
-                                      void *key);
-
-__u32 djb2_hashfn(struct lustre_class_hash_body *hash_body, void* key,
-                  size_t size);
-
-/* ( uuid <-> export ) hash operations define */
-__u32 uuid_hashfn(struct lustre_class_hash_body *hash_body,  void * key);
-int uuid_hash_key_compare(void *key, struct hlist_node * compared_hnode);
-void * uuid_export_refcount_get(struct hlist_node * actual_hnode);
-void uuid_export_refcount_put(struct hlist_node * actual_hnode);
-
-/* ( nid <-> export ) hash operations define */
-__u32 nid_hashfn(struct lustre_class_hash_body *hash_body,  void * key);
-int nid_hash_key_compare(void *key, struct hlist_node * compared_hnode);
-void * nid_export_refcount_get(struct hlist_node * actual_hnode);
-void nid_export_refcount_put(struct hlist_node * actual_hnode);
-
-/* ( net_peer <-> connection ) hash operations define */
-__u32 conn_hashfn(struct lustre_class_hash_body *hash_body,  void * key);
-int conn_hash_key_compare(void *key, struct hlist_node * compared_hnode);
-void * conn_refcount_get(struct hlist_node * actual_hnode);
-void conn_refcount_put(struct hlist_node * actual_hnode);
-
-/* ( nid <-> nidstats ) hash operations define. uses nid_hashfn */
-int nidstats_hash_key_compare(void *key, struct hlist_node * compared_hnode);
-void* nidstats_refcount_get(struct hlist_node * actual_hnode);
-void nidstats_refcount_put(struct hlist_node * actual_hnode);
-extern struct lustre_hash_operations nid_stat_hash_operations;
-
-#ifdef __KERNEL__
-/* ( lqs <-> qctxt ) hash operations define b=10600 */
-__u32 lqs_hashfn(struct lustre_class_hash_body *hash_body,  void * key);
-int lqs_hash_key_compare(void *key, struct hlist_node * compared_hnode);
-void * lqs_refcount_get(struct hlist_node * actual_hnode);
-void lqs_refcount_put(struct hlist_node * actual_hnode);
-#endif
+}
+
+static inline struct hlist_node *
+__lustre_hash_bucket_lookup(lustre_hash_t *lh,
+                            lustre_hash_bucket_t *lhb, void *key)
+{
+        struct hlist_node *hnode;
+
+        hlist_for_each(hnode, &lhb->lhb_head)
+                if (lh_compare(lh, key, hnode))
+                        return hnode;
+
+        return NULL;
+}
+
+static inline void *
+__lustre_hash_bucket_add(lustre_hash_t *lh,
+                         lustre_hash_bucket_t *lhb,
+                         struct hlist_node *hnode)
+{
+        hlist_add_head(hnode, &(lhb->lhb_head));
+        atomic_inc(&lhb->lhb_count);
+        atomic_inc(&lh->lh_count);
+
+        return lh_get(lh, hnode);
+}
+
+static inline void *
+__lustre_hash_bucket_del(lustre_hash_t *lh,
+                         lustre_hash_bucket_t *lhb,
+                         struct hlist_node *hnode)
+{
+        hlist_del_init(hnode);
+        LASSERT(atomic_read(&lhb->lhb_count) > 0);
+        atomic_dec(&lhb->lhb_count);
+        LASSERT(atomic_read(&lh->lh_count) > 0);
+        atomic_dec(&lh->lh_count);
+
+        return lh_put(lh, hnode);
+}
+
+/* Hash init/cleanup functions */
+lustre_hash_t *lustre_hash_init(char *name, unsigned int cur_bits, 
+                                unsigned int max_bits,
+                                lustre_hash_ops_t *ops, int flags);
+void lustre_hash_exit(lustre_hash_t *lh);
+
+/* Hash addition functions */
+void lustre_hash_add(lustre_hash_t *lh, void *key,
+                     struct hlist_node *hnode);
+int lustre_hash_add_unique(lustre_hash_t *lh, void *key,
+                           struct hlist_node *hnode);
+void *lustre_hash_findadd_unique(lustre_hash_t *lh, void *key,
+                                 struct hlist_node *hnode);
+
+/* Hash deletion functions */
+void *lustre_hash_del(lustre_hash_t *lh, void *key, struct hlist_node *hnode);
+void *lustre_hash_del_key(lustre_hash_t *lh, void *key);
+
+/* Hash lookup/for_each functions */
+void *lustre_hash_lookup(lustre_hash_t *lh, void *key);
+typedef void (*lh_for_each_cb)(void *obj, void *data);
+void lustre_hash_for_each(lustre_hash_t *lh, lh_for_each_cb, void *data);
+void lustre_hash_for_each_safe(lustre_hash_t *lh, lh_for_each_cb, void *data);
+void lustre_hash_for_each_empty(lustre_hash_t *lh, lh_for_each_cb, void *data);
+void lustre_hash_for_each_key(lustre_hash_t *lh, void *key,
+                              lh_for_each_cb, void *data);
+
+/* 
+ * Rehash - Theta is calculated to be the average chained
+ * hash depth assuming a perfectly uniform hash funcion. 
+ */
+int lustre_hash_rehash(lustre_hash_t *lh, int bits);
+void lustre_hash_rehash_key(lustre_hash_t *lh, void *old_key,
+                            void *new_key, struct hlist_node *hnode);
+
+
+#define LH_THETA_BITS  10
+
+/* Return integer component of theta */
+static inline int __lustre_hash_theta_int(int theta)
+{
+        return (theta >> LH_THETA_BITS);
+}
+
+/* Return a fractional value between 0 and 999 */
+static inline int __lustre_hash_theta_frac(int theta)
+{
+        return ((theta * 1000) >> LH_THETA_BITS) - 
+               (__lustre_hash_theta_int(theta) * 1000);
+}
+
+static inline int __lustre_hash_theta(lustre_hash_t *lh)
+{
+        return (atomic_read(&lh->lh_count) << LH_THETA_BITS) >> lh->lh_cur_bits;
+}
+
+static inline void __lustre_hash_set_theta(lustre_hash_t *lh, int min, int max)
+{
+        LASSERT(min < max);
+        lh->lh_min_theta = min;
+        lh->lh_max_theta = max;
+}
+
+/* Generic debug formatting routines mainly for proc handler */
+int lustre_hash_debug_header(char *str, int size);
+int lustre_hash_debug_str(lustre_hash_t *lh, char *str, int size);
+
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL
+
+/*
+ * Generic djb2 hash algorithm for character arrays.
+ */
+static inline unsigned
+lh_djb2_hash(void *key, size_t size, unsigned mask)
+{
+        unsigned i, hash = 5381;
+
+        LASSERT(key != NULL);
+
+        for (i = 0; i < size; i++)
+                hash = hash * 33 + ((char *)key)[i];
+
+        return (hash & mask);
+}
+
+/*
+ * Generic u32 hash algorithm.
+ */
+static inline unsigned
+lh_u32_hash(__u32 key, unsigned mask)
+{
+        return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask);
+}
+
+/*
+ * Generic u64 hash algorithm.
+ */
+static inline unsigned
+lh_u64_hash(__u64 key, unsigned mask)
+{
+        return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask);
+}
+
+#define lh_for_each_bucket(lh, lhb, pos)         \
+        for (pos = 0;                            \
+             pos <= lh->lh_cur_mask &&           \
+             ({ lhb = &lh->lh_buckets[i]; 1; }); \
+             pos++)
  
  #endif /* __CLASS_HASH_H */
diff --git a/lustre/include/darwin/lprocfs_status.h b/lustre/include/darwin/lprocfs_status.h

index dc17b9f..70056c5 100644 (file)
--- a/lustre/include/darwin/lprocfs_status.h
+++ b/lustre/include/darwin/lprocfs_status.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Top level header file for LProc SNMP
- *   Author: Hariharan Thantry thantry@users.sourceforge.net
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/darwin/lprocfs_status.h
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
   */
  #ifndef _DARWIN_LPROCFS_SNMP_H
  #define _DARWIN_LPROCFS_SNMP_H
diff --git a/lustre/include/darwin/lustre_compat.h b/lustre/include/darwin/lustre_compat.h

index d11c8d6..1769792 100644 (file)
--- a/lustre/include/darwin/lustre_compat.h
+++ b/lustre/include/darwin/lustre_compat.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __DARWIN_LUSTRE_COMPAT_H__
  #define __DARWIN_LUSTRE_COMPAT_H__
  
diff --git a/lustre/include/darwin/lustre_debug.h b/lustre/include/darwin/lustre_debug.h

index b2b72f6..6db177d 100644 (file)
--- a/lustre/include/darwin/lustre_debug.h
+++ b/lustre/include/darwin/lustre_debug.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _DARWIN_LUSTRE_DEBUG_H
diff --git a/lustre/include/darwin/lustre_dlm.h b/lustre/include/darwin/lustre_dlm.h

index 98587f3..952a187 100644 (file)
--- a/lustre/include/darwin/lustre_dlm.h
+++ b/lustre/include/darwin/lustre_dlm.h
@@ -1,6 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * (visit-tags-table FILE)
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _DARWIN_LUSTRE_DLM_H__
diff --git a/lustre/include/darwin/lustre_fsfilt.h b/lustre/include/darwin/lustre_fsfilt.h

index e3d9a7e..c9c1f3f 100644 (file)
--- a/lustre/include/darwin/lustre_fsfilt.h
+++ b/lustre/include/darwin/lustre_fsfilt.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2004 Cluster File Systems, Inc. <info@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Filesystem interface helper.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/darwin/lustre_fsfilt.h
   *
+ * Filesystem interface helper.
   */
  
  #ifndef _DARWIN_LUSTRE_FSFILT_H
diff --git a/lustre/include/darwin/lustre_handles.h b/lustre/include/darwin/lustre_handles.h

index 341a25b..3cc0390 100644 (file)
--- a/lustre/include/darwin/lustre_handles.h
+++ b/lustre/include/darwin/lustre_handles.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __DARWIN_LUSTRE_HANDLES_H_
  #define __DARWIN_LUSTR_HANDLES_H_
  
@@ -9,4 +45,3 @@
  #include <libcfs/libcfs.h>
  
  #endif
-
diff --git a/lustre/include/darwin/lustre_lib.h b/lustre/include/darwin/lustre_lib.h

index 5adadae..c13599c 100644 (file)
--- a/lustre/include/darwin/lustre_lib.h
+++ b/lustre/include/darwin/lustre_lib.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Basic Lustre library routines.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
+ * lustre/include/darwin/lustre_lib.h
+ *
+ * Basic Lustre library routines.
   */
  
  #ifndef _DARWIN_LUSTRE_LIB_H
@@ -72,5 +88,3 @@ static inline sigset_t l_w_e_set_sigs(sigset_t sigs)
  #endif
  
  #endif
-
-
diff --git a/lustre/include/darwin/lustre_lite.h b/lustre/include/darwin/lustre_lite.h

index 611c2a8..49434d6 100644 (file)
--- a/lustre/include/darwin/lustre_lite.h
+++ b/lustre/include/darwin/lustre_lite.h
@@ -1,12 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * lustre lite cluster file system
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- * This code is issued under the GNU General Public License.
- * See the file COPYING in this distribution
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/darwin/lustre_lite.h
   *
- * Copyright (C) 2002 Cluster File Systems, Inc. <info@clusterfs.com>
+ * lustre lite cluster file system
   */
  
  #ifndef _DARWIN_LL_H
diff --git a/lustre/include/darwin/lustre_log.h b/lustre/include/darwin/lustre_log.h

index d777465..fafae6f 100644 (file)
--- a/lustre/include/darwin/lustre_log.h
+++ b/lustre/include/darwin/lustre_log.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _DARWIN_LUSTRE_LOG_H
  #define _DARWIN_LUSTRE_LOG_H
  
diff --git a/lustre/include/darwin/lustre_mds.h b/lustre/include/darwin/lustre_mds.h

index 7fd8549..3288f11 100644 (file)
--- a/lustre/include/darwin/lustre_mds.h
+++ b/lustre/include/darwin/lustre_mds.h
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc. <info@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _DARWIN_LUSTRE_MDS_H
diff --git a/lustre/include/darwin/lustre_net.h b/lustre/include/darwin/lustre_net.h

index f028545..2254d5e 100644 (file)
--- a/lustre/include/darwin/lustre_net.h
+++ b/lustre/include/darwin/lustre_net.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _DARWIN_LUSTRE_NET_H
diff --git a/lustre/include/darwin/lustre_quota.h b/lustre/include/darwin/lustre_quota.h

index 5d0864f..4274da4 100644 (file)
--- a/lustre/include/darwin/lustre_quota.h
+++ b/lustre/include/darwin/lustre_quota.h
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _DARWIN_LUSTRE_QUOTA_H
  #define _DARWIN_LUSTRE_QUOTA_H
  
diff --git a/lustre/include/darwin/lustre_types.h b/lustre/include/darwin/lustre_types.h

index 651cf2d..c382fce 100644 (file)
--- a/lustre/include/darwin/lustre_types.h
+++ b/lustre/include/darwin/lustre_types.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _LUSTRE_DARWIN_TYPES_H
  #define _LUSTRE_DARWIN_TYPES_H
  
diff --git a/lustre/include/darwin/lustre_user.h b/lustre/include/darwin/lustre_user.h

index a495e60..3e5c5e1 100644 (file)
--- a/lustre/include/darwin/lustre_user.h
+++ b/lustre/include/darwin/lustre_user.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/darwin/lustre_user.h
   *
   * Lustre public user-space interface definitions.
   */
diff --git a/lustre/include/darwin/lvfs.h b/lustre/include/darwin/lvfs.h

index d271854..a465578 100644 (file)
--- a/lustre/include/darwin/lvfs.h
+++ b/lustre/include/darwin/lvfs.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __DARWIN_LVFS_H__
  #define __DARWIN_LVFS_H__
  
@@ -12,7 +48,7 @@
  struct lvfs_ucred { 
         __u32 luc_fsuid; 
         __u32 luc_fsgid; 
-       __u32 luc_cap; 
+       cfs_kernel_cap_t luc_cap; 
         __u32 luc_uid; 
         __u32 luc_umask;
  };
diff --git a/lustre/include/darwin/obd.h b/lustre/include/darwin/obd.h

index 175758e..a00ddec 100644 (file)
--- a/lustre/include/darwin/obd.h
+++ b/lustre/include/darwin/obd.h
@@ -1,10 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- * This code is issued under the GNU General Public License.
- * See the file COPYING in this distribution
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __DARWIN_OBD_H
diff --git a/lustre/include/darwin/obd_class.h b/lustre/include/darwin/obd_class.h

index 833da61..0ad697e 100644 (file)
--- a/lustre/include/darwin/obd_class.h
+++ b/lustre/include/darwin/obd_class.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __DARWIN_CLASS_OBD_H
diff --git a/lustre/include/darwin/obd_support.h b/lustre/include/darwin/obd_support.h

index 8ff7200..022289c 100644 (file)
--- a/lustre/include/darwin/obd_support.h
+++ b/lustre/include/darwin/obd_support.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _DARWIN_OBD_SUPPORT
diff --git a/lustre/include/interval_tree.h b/lustre/include/interval_tree.h

index 41436c1..df85d56 100644 (file)
--- a/lustre/include/interval_tree.h
+++ b/lustre/include/interval_tree.h
@@ -1,28 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * (visit-tags-table FILE)
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2007 Cluster File Systems, Inc.
- *   Author: Huang Wei <huangwei@clusterfs.com>
- *   Author: Jay Xiong <jinshan.xiong@sun.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
   */
  
  #ifndef _INTERVAL_H__
@@ -35,8 +49,10 @@ struct interval_node {
          struct interval_node   *in_left;
          struct interval_node   *in_right;
          struct interval_node   *in_parent;
-        __u8                    in_color;
-        __u8                    res1[7];  /* tags, 8-bytes aligned */
+        unsigned                in_color:1,
+                                in_intree:1, /** set if the node is in tree */
+                                in_res1:30;
+        __u8                    in_res2[4];  /** tags, 8-bytes aligned */
          __u64                   in_max_high;
          struct interval_node_extent {
                  __u64 start;
@@ -49,6 +65,11 @@ enum interval_iter {
          INTERVAL_ITER_STOP = 2
  };
  
+static inline int interval_is_intree(struct interval_node *node)
+{
+        return node->in_intree == 1;
+}
+
  static inline __u64 interval_low(struct interval_node *node)
  {
          return node->in_extent.start;
diff --git a/lustre/include/ioctl.h b/lustre/include/ioctl.h

index fc2207b..3070446 100644 (file)
--- a/lustre/include/ioctl.h
+++ b/lustre/include/ioctl.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _IOWR
  
  /* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h,
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h

index 84f9242..cf6cb1a 100644 (file)
--- a/lustre/include/liblustre.h
+++ b/lustre/include/liblustre.h
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <info@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * User-space Lustre headers.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/liblustre.h
+ *
+ * User-space Lustre headers.
+ */
+
  #ifndef LIBLUSTRE_H__
  #define LIBLUSTRE_H__
  
@@ -112,7 +129,6 @@ typedef unsigned short umode_t;
  #define set_page_private(page, v) ((page)->private = (v))
  #endif
  
-
  static inline void inter_module_put(void *a)
  {
          return;
@@ -194,17 +210,30 @@ typedef int (write_proc_t)(struct file *file, const char *buffer,
  
  static __inline__ int ext2_set_bit(int nr, void *addr)
  {
+#ifdef __BIG_ENDIAN
+        return set_bit((nr ^ ((BITS_PER_LONG-1) & ~0x7)), addr);
+#else
          return set_bit(nr, addr);
+#endif
  }
  
  static __inline__ int ext2_clear_bit(int nr, void *addr)
  {
+#ifdef __BIG_ENDIAN
+        return clear_bit((nr ^ ((BITS_PER_LONG-1) & ~0x7)), addr);
+#else
          return clear_bit(nr, addr);
+#endif
  }
  
  static __inline__ int ext2_test_bit(int nr, void *addr)
  {
+#ifdef __BIG_ENDIAN
+        __const__ unsigned char *tmp = (__const__ unsigned char *) addr;
+        return (tmp[nr >> 3] >> (nr & 7)) & 1;
+#else
          return test_bit(nr, addr);
+#endif
  }
  
  /* modules */
@@ -321,6 +350,7 @@ typedef spinlock_t rwlock_t;
  #ifndef ERESTARTSYS
  #define ERESTARTSYS ERESTART
  #endif
+#undef HZ
  #define HZ 1
  
  /* random */
@@ -580,7 +610,7 @@ struct task_struct {
          int max_groups;
          int ngroups;
          gid_t *groups;
-        __u32 cap_effective;
+        cfs_cap_t cap_effective;
  };
  
  typedef struct task_struct cfs_task_t;
@@ -590,13 +620,6 @@ typedef struct task_struct cfs_task_t;
  
  extern struct task_struct *current;
  int in_group_p(gid_t gid);
-static inline int capable(int cap)
-{
-        if (current->cap_effective & (1 << cap))
-                return 1;
-        else
-                return 0;
-}
  
  #define set_current_state(foo) do { current->state = foo; } while (0)
  
@@ -736,12 +759,6 @@ typedef enum {
      CAP_SET=1
  } cap_flag_value_t;
  
-#define CAP_DAC_OVERRIDE        1
-#define CAP_DAC_READ_SEARCH     2
-#define CAP_FOWNER              3
-#define CAP_FSETID              4
-#define CAP_SYS_ADMIN          21
-
  cap_t   cap_get_proc(void);
  int     cap_get_flag(cap_t, cap_value_t, cap_flag_t, cap_flag_value_t *);
  
diff --git a/lustre/include/linux/Makefile.am b/lustre/include/linux/Makefile.am

index 9604a6a..f5b4014 100644 (file)
--- a/lustre/include/linux/Makefile.am
+++ b/lustre/include/linux/Makefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  linuxdir = $(includedir)/linux
  
@@ -14,4 +45,3 @@ EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_lib.h \
    lustre_log.h lustre_compat25.h lustre_fsfilt.h lustre_mds.h obd.h \
    lvfs.h lvfs_linux.h lustre_lite.h  lustre_quota.h \
    lustre_user.h lustre_types.h lustre_patchless_compat.h lustre_intent.h
-
diff --git a/lustre/include/linux/lprocfs_status.h b/lustre/include/linux/lprocfs_status.h

index 85574fa..936042c 100644 (file)
--- a/lustre/include/linux/lprocfs_status.h
+++ b/lustre/include/linux/lprocfs_status.h
@@ -1,25 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Top level header file for LProc SNMP
- *   Author: Hariharan Thantry thantry@users.sourceforge.net
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
   */
  #ifndef _LINUX_LPROCFS_SNMP_H
  #define _LINUX_LPROCFS_SNMP_H
@@ -39,12 +57,7 @@
  #include <linux/smp.h>
  #include <linux/rwsem.h>
  #include <libcfs/kp30.h>
-
-# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#  include <linux/statfs.h>
-# else 
-#  define kstatfs statfs
-# endif
+#include <linux/statfs.h>
  
  #else 
  #  define kstatfs statfs
diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h

index d9e4499..9ad15c0 100644 (file)
--- a/lustre/include/linux/lustre_compat25.h
+++ b/lustre/include/linux/lustre_compat25.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LINUX_COMPAT25_H
@@ -25,8 +39,8 @@
  
  #ifdef __KERNEL__
  
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,69)
-#error sorry, lustre requires at least 2.5.69
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5)
+#error sorry, lustre requires at least 2.6.5
  #endif
  
  #include <libcfs/linux/portals_compat25.h>
@@ -121,13 +135,9 @@ void groups_free(struct group_info *ginfo);
  #define gfp_t int
  #endif
  
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-
  #define lock_dentry(___dentry)          spin_lock(&(___dentry)->d_lock)
  #define unlock_dentry(___dentry)        spin_unlock(&(___dentry)->d_lock)
  
-#define lock_24kernel()         do {} while (0)
-#define unlock_24kernel()       do {} while (0)
  #define ll_kernel_locked()      kernel_locked()
  
  /*
@@ -223,204 +233,6 @@ extern void __d_move(struct dentry *dentry, struct dentry *target);
          ((!PageWriteback(page) && (cmd & OBD_BRW_READ)) || \
           (PageWriteback(page) && (cmd & OBD_BRW_WRITE)))
  
-#else /* 2.4.. */
-
-#define ll_flock_lock_file_wait(file, lock, can_sleep) \
-        do {} while(0)
-
-#define lock_dentry(___dentry)
-#define unlock_dentry(___dentry)
-
-#define lock_24kernel()         lock_kernel()
-#define unlock_24kernel()       unlock_kernel()
-#define ll_kernel_locked()      (current->lock_depth >= 0)
-
-/* 2.4 kernels have HZ=100 on i386/x86_64, this should be reasonably safe */
-#define get_jiffies_64()        (__u64)jiffies
-
-#ifdef HAVE_MM_INLINE
-#include <linux/mm_inline.h>
-#endif
-
-#ifndef pgoff_t
-#define pgoff_t unsigned long
-#endif
-
-#define ll_vfs_create(a,b,c,d)              vfs_create(a,b,c)
-#define ll_permission(inode,mask,nd)        permission(inode,mask)
-#define ILOOKUP(sb, ino, test, data)        ilookup4(sb, ino, test, data);
-#define DCACHE_DISCONNECTED                 DCACHE_NFSD_DISCONNECTED
-#define ll_dev_t                            int
-#define old_encode_dev(dev)                 (dev)
-
-/* 2.5 uses hlists for some things, like the d_hash.  we'll treat them
- * as 2.5 and let macros drop back.. */
-#ifndef HLIST_HEAD /* until we get a kernel newer than l28 */
-#define hlist_entry                     list_entry
-#define hlist_head                      list_head
-#define hlist_node                      list_head
-#define HLIST_HEAD                      LIST_HEAD
-#define INIT_HLIST_HEAD                 INIT_LIST_HEAD
-#define hlist_del_init                  list_del_init
-#define hlist_add_head                  list_add
-#endif
-
-#ifndef INIT_HLIST_NODE
-#define INIT_HLIST_NODE(p)              ((p)->next = NULL, (p)->prev = NULL)
-#endif
-
-#ifndef hlist_for_each
-#define hlist_for_each                  list_for_each
-#endif
-
-#ifndef hlist_for_each_safe
-#define hlist_for_each_safe             list_for_each_safe
-#endif
-
-#define KDEVT_INIT(val)                 (val)
-#define ext3_xattr_set_handle           ext3_xattr_set
-#define try_module_get                  __MOD_INC_USE_COUNT
-#define module_put                      __MOD_DEC_USE_COUNT
-#define LTIME_S(time)                   (time)
-
-#if !defined(CONFIG_RH_2_4_20) && !defined(cpu_online)
-#define cpu_online(cpu)                 test_bit(cpu, &(cpu_online_map))
-#endif
-
-static inline int ll_path_lookup(const char *path, unsigned flags,
-                                 struct nameidata *nd)
-{
-        int error = 0;
-        if (path_init(path, flags, nd))
-                error = path_walk(path, nd);
-        return error;
-}
-#define ll_permission(inode,mask,nd)    permission(inode,mask)
-typedef long sector_t;
-
-#define ll_pgcache_lock(mapping)        spin_lock(&pagecache_lock)
-#define ll_pgcache_unlock(mapping)      spin_unlock(&pagecache_lock)
-#define ll_call_writepage(inode, page)  \
-                               (inode)->i_mapping->a_ops->writepage(page)
-#define ll_invalidate_inode_pages(inode) invalidate_inode_pages(inode)
-#define ll_truncate_complete_page(page) truncate_complete_page(page)
-
-static inline void clear_page_dirty(struct page *page)
-{
-        if (PageDirty(page))
-                ClearPageDirty(page);
-}
-
-static inline int clear_page_dirty_for_io(struct page *page)
-{
-        struct address_space *mapping = page->mapping;
-
-        if (page->mapping && PageDirty(page)) {
-                ClearPageDirty(page);
-                ll_pgcache_lock(mapping);
-                list_del(&page->list);
-                list_add(&page->list, &mapping->locked_pages);
-                ll_pgcache_unlock(mapping);
-                return 1;
-        }
-        return 0;
-}
-
-static inline void ll_redirty_page(struct page *page)
-{
-        SetPageDirty(page);
-        ClearPageLaunder(page);
-}
-
-static inline void __d_drop(struct dentry *dentry)
-{
-        list_del_init(&dentry->d_hash);
-}
-
-static inline int cleanup_group_info(void)
-{
-        /* Get rid of unneeded supplementary groups */
-        current->ngroups = 0;
-        memset(current->groups, 0, sizeof(current->groups));
-        return 0;
-}
-
-#ifndef HAVE_COND_RESCHED
-static inline void cond_resched(void)
-{
-        if (unlikely(need_resched())) {
-                set_current_state(TASK_RUNNING);
-                schedule();
-        }
-}
-#endif
-
-/* to find proc_dir_entry from inode. 2.6 has native one -bzzz */
-#ifndef HAVE_PDE
-#define PDE(ii)         ((ii)->u.generic_ip)
-#endif
-
-#define __set_page_ll_data(page, llap) set_page_private(page, (unsigned long)llap)
-#define __clear_page_ll_data(page) set_page_private(page, 0)
-#define PageWriteback(page) 0
-#define CheckWriteback(page, cmd) 1
-#define set_page_writeback(page) do {} while (0)
-#define end_page_writeback(page) do {} while (0)
-
-static inline int mapping_mapped(struct address_space *mapping)
-{
-        if (mapping->i_mmap_shared)
-                return 1;
-        if (mapping->i_mmap)
-                return 1;
-        return 0;
-}
-
-#ifdef ZAP_PAGE_RANGE_VMA
-#define ll_zap_page_range(vma, addr, len)  zap_page_range(vma, addr, len)
-#else
-#define ll_zap_page_range(vma, addr, len)  zap_page_range(vma->vm_mm, addr, len)
-#endif
-
-#ifndef HAVE_PAGE_MAPPED
-/* Poor man's page_mapped. substract from page count, counts from
-   buffers/pagecache and our own count (we are supposed to hold one reference).
-   What is left are user mappings and also others who work with this page now,
-   but there are supposedly none. */
-static inline int page_mapped(struct page *page)
-{
-        return page_count(page) - !!page->mapping - !!page->buffers - 1;
-}
-#endif /* !HAVE_PAGE_MAPPED */
-
-static inline void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
-{
-        update_atime(dentry->d_inode);
-}
-
-static inline void file_accessed(struct file *file)
-{
-#ifdef O_NOATIME
-        if (file->f_flags & O_NOATIME)
-                return;
-#endif
-        touch_atime(file->f_vfsmnt, file->f_dentry);
-}
-
-#ifndef typecheck
-/*
- * Check at compile time that something is of a particular type.
- * Always evaluates to 1 so you may use it easily in comparisons.
- */
-#define typecheck(type,x) \
-({     type __dummy; \
-       typeof(x) __dummy2; \
-       (void)(&__dummy == &__dummy2); \
-       1; \
-})
-#endif
-
-#endif /* end of 2.4 compat macros */
  
  #ifdef HAVE_PAGE_LIST
  static inline int mapping_has_pages(struct address_space *mapping)
@@ -452,14 +264,21 @@ static inline int mapping_has_pages(struct address_space *mapping)
  
  #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7))
  #define ll_set_dflags(dentry, flags) do { dentry->d_vfs_flags |= flags; } while(0)
-#define ll_vfs_symlink(dir, dentry, path, mode) vfs_symlink(dir, dentry, path)
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                       vfs_symlink(dir, dentry, path)
  #else
  #define ll_set_dflags(dentry, flags) do { \
                  spin_lock(&dentry->d_lock); \
                  dentry->d_flags |= flags; \
                  spin_unlock(&dentry->d_lock); \
          } while(0)
-#define ll_vfs_symlink(dir, dentry, path, mode) vfs_symlink(dir, dentry, path, mode)
+#ifdef HAVE_SECURITY_PLUG
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                vfs_symlink(dir, dentry, mnt, path, mode)
+#else
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                vfs_symlink(dir, dentry, path, mode)
+#endif
  #endif
  
  #ifndef container_of
@@ -494,6 +313,45 @@ static inline int mapping_has_pages(struct address_space *mapping)
  #define filemap_fdatawrite(mapping)      filemap_fdatasync(mapping)
  #endif
  
+#include <linux/mpage.h>        /* for generic_writepages */
+#ifndef HAVE_FILEMAP_FDATAWRITE_RANGE
+#include <linux/backing-dev.h>  /* for mapping->backing_dev_info */
+static inline int filemap_fdatawrite_range(struct address_space *mapping,
+                                           loff_t start, loff_t end)
+{
+        int rc;
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_ALL,
+                .nr_to_write = (end - start + PAGE_SIZE - 1) >> PAGE_SHIFT,
+        };
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18)
+        wbc.range_start = start;
+        wbc.range_end = end;
+#else
+        wbc.start = start;
+        wbc.end = end;
+#endif
+
+#ifdef mapping_cap_writeback_dirty
+        if (!mapping_cap_writeback_dirty(mapping))
+               rc = 0;
+#else
+        if (mapping->backing_dev_info->memory_backed)
+                rc = 0;
+#endif
+        /* do_writepages() */
+        else if (mapping->a_ops->writepages)
+                rc = mapping->a_ops->writepages(mapping, &wbc);
+        else
+                rc = generic_writepages(mapping, &wbc);
+        return rc;
+}
+#else
+int filemap_fdatawrite_range(struct address_space *mapping,
+                             loff_t start, loff_t end);
+#endif
+
  #ifdef HAVE_VFS_KERN_MOUNT
  static inline 
  struct vfsmount *
@@ -545,5 +403,70 @@ int ll_unregister_blkdev(unsigned int dev, const char *name)
  #define LL_RENAME_DOES_D_MOVE  FS_ODD_RENAME
  #endif
  
+#ifdef HAVE_SECURITY_PLUG
+#define ll_remove_suid(inode,mnt)               remove_suid(inode,mnt)
+#define ll_vfs_rmdir(dir,entry,mnt)             vfs_rmdir(dir,entry,mnt)
+#define ll_vfs_mkdir(inode,dir,mnt,mode)        vfs_mkdir(inode,dir,mnt,mode)
+#define ll_vfs_link(old,mnt,dir,new,mnt1)       vfs_link(old,mnt,dir,new,mnt1)
+#define ll_vfs_unlink(inode,entry,mnt)          vfs_unlink(inode,entry,mnt)
+#define ll_vfs_mknod(dir,entry,mnt,mode,dev)            \
+                vfs_mknod(dir,entry,mnt,mode,dev)
+#define ll_security_inode_unlink(dir,entry,mnt)         \
+                security_inode_unlink(dir,entry,mnt)     
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+                vfs_rename(old,old_dir,mnt,new,new_dir,mnt1)
+#else
+#define ll_remove_suid(inode,mnt)               remove_suid(inode)
+#define ll_vfs_rmdir(dir,entry,mnt)             vfs_rmdir(dir,entry)
+#define ll_vfs_mkdir(inode,dir,mnt,mode)        vfs_mkdir(inode,dir,mode)
+#define ll_vfs_link(old,mnt,dir,new,mnt1)       vfs_link(old,dir,new)
+#define ll_vfs_unlink(inode,entry,mnt)          vfs_unlink(inode,entry)
+#define ll_vfs_mknod(dir,entry,mnt,mode,dev)    vfs_mknod(dir,entry,mode,dev)
+#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)     
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+                vfs_rename(old,old_dir,new,new_dir)
+#endif
+
+#ifndef get_cpu
+#ifdef CONFIG_PREEMPT
+#define get_cpu()       ({ preempt_disable(); smp_processor_id(); })
+#define put_cpu()       preempt_enable()
+#else
+#define get_cpu()       smp_processor_id()
+#define put_cpu()
+#endif
+#endif /* get_cpu & put_cpu */
+
+#ifndef for_each_possible_cpu
+#define for_each_possible_cpu(i) for_each_cpu(i)
+#endif
+
+#ifndef cpu_to_node
+#define cpu_to_node(cpu)         0
+#endif
+
+#ifndef abs
+static inline int abs(int x)
+{
+        return (x < 0) ? -x : x;
+}
+#endif
+
+#ifndef labs
+static inline long labs(long x)
+{
+        return (x < 0) ? -x : x;
+}
+#endif
+
+/* Using kernel fls(). Userspace will use one defined in user-bitops.h. */
+#ifndef __fls
+#define __fls fls
+#endif
+
+#ifdef HAVE_INVALIDATE_INODE_PAGES
+#define invalidate_mapping_pages(mapping,s,e) invalidate_inode_pages(mapping)
+#endif
+
  #endif /* __KERNEL__ */
  #endif /* _COMPAT25_H */
diff --git a/lustre/include/linux/lustre_debug.h b/lustre/include/linux/lustre_debug.h

index db872a9..9753420 100644 (file)
--- a/lustre/include/linux/lustre_debug.h
+++ b/lustre/include/linux/lustre_debug.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LINUX_LUSTRE_DEBUG_H
diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h

index a19f31a..07f7691 100644 (file)
--- a/lustre/include/linux/lustre_dlm.h
+++ b/lustre/include/linux/lustre_dlm.h
@@ -1,6 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * (visit-tags-table FILE)
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LINUX_LUSTRE_DLM_H__
diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h

index f1698e5..0e9f801 100644 (file)
--- a/lustre/include/linux/lustre_fsfilt.h
+++ b/lustre/include/linux/lustre_fsfilt.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2004 Cluster File Systems, Inc. <info@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Filesystem interface helper.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_fsfilt.h
   *
+ * Filesystem interface helper.
   */
  
  #ifndef _LINUX_LUSTRE_FSFILT_H
@@ -108,6 +124,8 @@ struct fsfilt_operations {
          int     (* fs_qids)(struct file *file, struct inode *inode, int type,
                              struct list_head *list);
          int     (* fs_dquot)(struct lustre_dquot *dquot, int cmd);
+        int     (* fs_get_mblk)(struct super_block *sb, int *count,
+                                struct inode *inode, int frags);
          lvfs_sbdev_type (* fs_journal_sbdev)(struct super_block *sb);
  };
  
@@ -426,6 +444,15 @@ static inline int fsfilt_dquot(struct obd_device *obd,
          return -ENOTSUPP;
  }
  
+static inline int fsfilt_get_mblk(struct obd_device *obd,
+                                  struct super_block *sb, int *count,
+                                  struct inode *inode, int frags)
+{
+        if (obd->obd_fsops->fs_get_mblk)
+                return obd->obd_fsops->fs_get_mblk(sb, count, inode, frags);
+        return -ENOTSUPP;
+}
+
  static inline int fsfilt_map_inode_pages(struct obd_device *obd,
                                           struct inode *inode,
                                           struct page **page, int pages,
diff --git a/lustre/include/linux/lustre_handles.h b/lustre/include/linux/lustre_handles.h

index 166beb3..afbe6e5 100644 (file)
--- a/lustre/include/linux/lustre_handles.h
+++ b/lustre/include/linux/lustre_handles.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __LINUX_LUSTRE_HANDLES_H_
  #define __LINUX_LUSTRE_HANDLES_H_
  
diff --git a/lustre/include/linux/lustre_intent.h b/lustre/include/linux/lustre_intent.h

index 380853a..aed18dc 100644 (file)
--- a/lustre/include/linux/lustre_intent.h
+++ b/lustre/include/linux/lustre_intent.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef LUSTRE_INTENT_H
  #define LUSTRE_INTENT_H
  
diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h

index a2fd8ba..1092c61 100644 (file)
--- a/lustre/include/linux/lustre_lib.h
+++ b/lustre/include/linux/lustre_lib.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Basic Lustre library routines.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_lib.h
+ *
+ * Basic Lustre library routines.
   */
  
  #ifndef _LINUX_LUSTRE_LIB_H
@@ -90,4 +106,3 @@ static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
  #endif
  
  #endif /* _LUSTRE_LIB_H */
-
diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h

index c531c05..1857aac 100644 (file)
--- a/lustre/include/linux/lustre_lite.h
+++ b/lustre/include/linux/lustre_lite.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LINUX_LL_H
@@ -13,9 +45,7 @@
  
  #include <linux/version.h>
  
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <asm/statfs.h>
-#endif
  
  #include <linux/fs.h>
  #include <linux/dcache.h>
@@ -30,6 +60,34 @@
  #include <linux/lustre_compat25.h>
  #include <linux/pagemap.h>
  
+#ifdef HAVE_PERCPU_COUNTER
+#include <linux/percpu_counter.h>
+
+typedef struct percpu_counter lcounter_t;
+
+#define lcounter_read(counter)          (int)percpu_counter_read(counter)
+#define lcounter_inc(counter)           percpu_counter_inc(counter)
+#define lcounter_dec(counter)           percpu_counter_dec(counter)
+
+#ifdef HAVE_PERCPU_2ND_ARG
+# define lcounter_init(counter)          percpu_counter_init(counter, 0)
+#else
+# define lcounter_init(counter)          percpu_counter_init(counter)
+#endif
+
+#define lcounter_destroy(counter)       percpu_counter_destroy(counter)
+
+#else
+typedef struct { atomic_t count; } lcounter_t;
+
+#define lcounter_read(counter)          atomic_read(&counter->count)
+#define lcounter_inc(counter)           atomic_inc(&counter->count)
+#define lcounter_dec(counter)           atomic_dec(&counter->count)
+#define lcounter_init(counter)          atomic_set(&counter->count, 0)
+#define lcounter_destroy(counter)       
+
+#endif /* if defined HAVE_PERCPU_COUNTER */
+
  /* lprocfs.c */
  enum {
           LPROC_LL_DIRTY_HITS = 0,
@@ -52,12 +110,7 @@ enum {
           LPROC_LL_TRUNC,
           LPROC_LL_LOCKLESS_TRUNC,
           LPROC_LL_FLOCK,
-
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
           LPROC_LL_GETATTR,
-#else
-         LPROC_LL_REVALIDATE,
-#endif
           LPROC_LL_STAFS,
           LPROC_LL_ALLOC_INODE,
           LPROC_LL_SETXATTR,
diff --git a/lustre/include/linux/lustre_log.h b/lustre/include/linux/lustre_log.h

index 65e1c51..c1ab0e1 100644 (file)
--- a/lustre/include/linux/lustre_log.h
+++ b/lustre/include/linux/lustre_log.h
@@ -1,35 +1,50 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <info@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Generic infrastructure for managing a collection of logs.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- * These logs are used for:
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- * - orphan recovery: OST adds record on create
- * - mtime/size consistency: the OST adds a record on first write
- * - open/unlinked objects: OST adds a record on destroy
+ * lustre/include/linux/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *  - orphan recovery: OST adds record on create
+ *  - mtime/size consistency: the OST adds a record on first write
+ *  - open/unlinked objects: OST adds a record on destroy
   *
- * - mds unlink log: the MDS adds an entry upon delete
+ *  - mds unlink log: the MDS adds an entry upon delete
   *
- * - raid1 replication log between OST's
- * - MDS replication logs
+ *  - raid1 replication log between OST's
+ *  - MDS replication logs
   */
  
  #ifndef _LINUX_LUSTRE_LOG_H
diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h

index 00e5d2c..160dbe5 100644 (file)
--- a/lustre/include/linux/lustre_mds.h
+++ b/lustre/include/linux/lustre_mds.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_mds.h
   *
   * MDS data structures.
   * See also lustre_idl.h for wire formats of requests.
diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h

index 1d2f17e..ed8ba17 100644 (file)
--- a/lustre/include/linux/lustre_net.h
+++ b/lustre/include/linux/lustre_net.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LINUX_LUSTRE_NET_H
@@ -29,12 +43,8 @@
  
  #ifdef __KERNEL__
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/tqueue.h>
-#else
  #include <linux/workqueue.h>
  #endif
-#endif
  
  /* XXX Liang: should be moved to other header instead of here */
  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
diff --git a/lustre/include/linux/lustre_patchless_compat.h b/lustre/include/linux/lustre_patchless_compat.h

index afb599e..6a1b9f5 100644 (file)
--- a/lustre/include/linux/lustre_patchless_compat.h
+++ b/lustre/include/linux/lustre_patchless_compat.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef LUSTRE_PATCHLESS_COMPAT_H
diff --git a/lustre/include/linux/lustre_quota.h b/lustre/include/linux/lustre_quota.h

index 248bd5d..6b3e888 100644 (file)
--- a/lustre/include/linux/lustre_quota.h
+++ b/lustre/include/linux/lustre_quota.h
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _LINUX_LUSTRE_QUOTA_H
  #define _LINUX_LUSTRE_QUOTA_H
  
diff --git a/lustre/include/linux/lustre_types.h b/lustre/include/linux/lustre_types.h

index 3aec53f..8d5f0aa 100644 (file)
--- a/lustre/include/linux/lustre_types.h
+++ b/lustre/include/linux/lustre_types.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _LUSTRE_LINUX_TYPES_H
  #define _LUSTRE_LINUX_TYPES_H
  
diff --git a/lustre/include/linux/lustre_user.h b/lustre/include/linux/lustre_user.h

index 1b7fb8a..a7e3302 100644 (file)
--- a/lustre/include/linux/lustre_user.h
+++ b/lustre/include/linux/lustre_user.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_user.h
   *
   * Lustre public user-space interface definitions.
   */
@@ -23,9 +55,6 @@
  # endif
  #else
  # include <linux/version.h>
-# if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,21)
-#  define NEED_QUOTA_DEFS
-# endif
  # ifdef HAVE_QUOTA_SUPPORT
  #  include <linux/quota.h>
  # endif
@@ -46,7 +75,7 @@
  #endif
  
  #if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
-    defined(__craynv) || defined (__mips64__)
+    defined(__craynv) || defined (__mips64__) || defined(__powerpc64__)
  typedef struct stat     lstat_t;
  #define lstat_f         lstat
  #define HAVE_LOV_USER_MDS_DATA
diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h

index 04829c5..225204b 100644 (file)
--- a/lustre/include/linux/lvfs.h
+++ b/lustre/include/linux/lvfs.h
@@ -1,22 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lvfs.h
   *
   * lustre VFS/process permission interface
   */
@@ -47,7 +64,7 @@ struct lvfs_ucred {
          struct upcall_cache_entry *luc_uce;
          __u32 luc_fsuid;
          __u32 luc_fsgid;
-        __u32 luc_cap;
+        cfs_kernel_cap_t luc_cap;
          __u32 luc_suppgid1;
          __u32 luc_suppgid2;
          __u32 luc_umask;
@@ -84,13 +101,19 @@ struct lvfs_run_ctxt {
  
  #ifdef __KERNEL__
  
-struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix);
+struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
+                            char *name, int mode, int fix);
  struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix);
-int lustre_rename(struct dentry *dir, char *oldname, char *newname);
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname,
+                  char *newname);
  int lustre_fread(struct file *file, void *buf, int len, loff_t *off);
  int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off);
  int lustre_fsync(struct file *file);
  long l_readdir(struct file * file, struct list_head *dentry_list);
+int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
+                    struct iattr *newattrs);
+int simple_truncate(struct dentry *dir, struct vfsmount *mnt,
+                               char *name, loff_t length);
  
  static inline void l_dput(struct dentry *de)
  {
diff --git a/lustre/include/linux/lvfs_linux.h b/lustre/include/linux/lvfs_linux.h

index c7b0f7d..7a1712a 100644 (file)
--- a/lustre/include/linux/lvfs_linux.h
+++ b/lustre/include/linux/lvfs_linux.h
@@ -1,15 +1,46 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __LVFS_LINUX_H__
  #define __LVFS_LINUX_H__
  
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/fs.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <linux/namei.h>
-#endif
  #include <linux/sched.h>
  
  #include <lvfs.h>
@@ -44,20 +75,10 @@ struct lvfs_dentry_params
  };
  #define LVFS_DENTRY_PARAMS_INIT         { .ldp_magic = LVFS_DENTRY_PARAM_MAGIC }
  
-# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#  define BDEVNAME_DECLARE_STORAGE(foo) char foo[BDEVNAME_SIZE]
-#  define ll_bdevname(SB, STORAGE) __bdevname(kdev_t_to_nr(SB->s_dev), STORAGE)
-#  define lvfs_sbdev(SB)       ((SB)->s_bdev)
-#  define lvfs_sbdev_type      struct block_device *
+#define lvfs_sbdev(SB)       ((SB)->s_bdev)
+#define lvfs_sbdev_type      struct block_device *
     int fsync_bdev(struct block_device *);
-#  define lvfs_sbdev_sync      fsync_bdev
-# else
-#  define BDEVNAME_DECLARE_STORAGE(foo) char __unused_##foo
-#  define ll_bdevname(SB,STORAGE) ((void)__unused_##STORAGE,bdevname(lvfs_sbdev(SB)))
-#  define lvfs_sbdev(SB)       (kdev_t_to_nr((SB)->s_dev))
-#  define lvfs_sbdev_type      kdev_t
-#  define lvfs_sbdev_sync      fsync_dev
-# endif
+#define lvfs_sbdev_sync      fsync_bdev
  
  /* Instead of calling within lvfs (a layering violation) */
  #define lvfs_set_rdonly(obd, sb) \
diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h

index 747bff2..a84a7d4 100644 (file)
--- a/lustre/include/linux/obd.h
+++ b/lustre/include/linux/obd.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __LINUX_OBD_H
diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h

index dcb5ba1..c373296 100644 (file)
--- a/lustre/include/linux/obd_class.h
+++ b/lustre/include/linux/obd_class.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __LINUX_CLASS_OBD_H
@@ -45,9 +59,7 @@ void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid);
  void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
  void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid);
  void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
-#endif
-
-#if !defined(__KERNEL__) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#else
  #define to_kdev_t(dev) dev
  #define kdev_t_to_nr(dev) dev
  #endif
diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h

index a2201d3..128e84e 100644 (file)
--- a/lustre/include/linux/obd_support.h
+++ b/lustre/include/linux/obd_support.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LINUX_OBD_SUPPORT
@@ -159,9 +173,8 @@ static inline cksum_type_t cksum_type_unpack(obd_flag o_flags)
  #define OBD_FAIL_WRITE(obd, id, sb)                                          \
  {                                                                            \
          if (OBD_FAIL_CHECK(id)) {                                            \
-                BDEVNAME_DECLARE_STORAGE(tmp);                               \
                  CERROR("obd_fail_loc=%x, fail write operation on %s\n",      \
-                       id, ll_bdevname(sb, tmp));                            \
+                       id, sb->s_id);                                        \
                  lvfs_set_rdonly(obd, sb);                                    \
                  /* We set FAIL_ONCE because we never "un-fail" a device */   \
                  obd_fail_loc |= OBD_FAILED | OBD_FAIL_ONCE;                  \
diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h

index 688fbdc..69cee95 100644 (file)
--- a/lustre/include/lprocfs_status.h
+++ b/lustre/include/lprocfs_status.h
@@ -1,25 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Top level header file for LProc SNMP
- *   Author: Hariharan Thantry thantry@users.sourceforge.net
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
   */
  #ifndef _LPROCFS_SNMP_H
  #define _LPROCFS_SNMP_H
@@ -46,6 +64,10 @@ struct lprocfs_vars {
          cfs_write_proc_t *write_fptr;
          void *data;
          struct file_operations *fops;
+        /**
+         * /proc file mode.
+         */
+        mode_t proc_mode;
  };
  
  struct lprocfs_static_vars {
@@ -200,6 +222,25 @@ static inline int opcode_offset(__u32 opc) {
                          (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
                          (MDS_LAST_OPC - MDS_FIRST_OPC) +
                          (OST_LAST_OPC - OST_FIRST_OPC));
+       } else if (opc < QUOTA_LAST_OPC) {
+                /* LQUOTA Opcode */
+                return (opc -  QUOTA_FIRST_OPC +
+                        (LLOG_LAST_OPC - LLOG_FIRST_OPC) +
+                        (OBD_LAST_OPC - OBD_FIRST_OPC) +
+                        (MGS_LAST_OPC - MGS_FIRST_OPC) +
+                        (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
+                        (MDS_LAST_OPC - MDS_FIRST_OPC) +
+                        (OST_LAST_OPC - OST_FIRST_OPC));
+        } else if (opc < SEQ_LAST_OPC) {
+                /* SEQ opcode */
+                return (opc - SEQ_FIRST_OPC +
+                        (QUOTA_LAST_OPC - QUOTA_FIRST_OPC) +
+                        (LLOG_LAST_OPC - LLOG_FIRST_OPC) +
+                        (OBD_LAST_OPC - OBD_FIRST_OPC) +
+                        (MGS_LAST_OPC - MGS_FIRST_OPC) +
+                        (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
+                        (MDS_LAST_OPC - MDS_FIRST_OPC) +
+                        (OST_LAST_OPC - OST_FIRST_OPC));
          } else {
                  /* Unknown Opcode */
                  return -1;
@@ -211,7 +252,9 @@ static inline int opcode_offset(__u32 opc) {
                              (LDLM_LAST_OPC - LDLM_FIRST_OPC)   + \
                              (MGS_LAST_OPC - MGS_FIRST_OPC)     + \
                              (OBD_LAST_OPC - OBD_FIRST_OPC)     + \
-                            (LLOG_LAST_OPC - LLOG_FIRST_OPC))
+                            (LLOG_LAST_OPC - LLOG_FIRST_OPC)   + \
+                            (QUOTA_LAST_OPC - QUOTA_FIRST_OPC) + \
+                            (SEQ_LAST_OPC - SEQ_FIRST_OPC))
  
  #define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR)  + \
                             (EXTRA_LAST_OPC - EXTRA_FIRST_OPC))
@@ -233,12 +276,14 @@ enum {
          LDLM_EXTENT_ENQUEUE,
          LDLM_FLOCK_ENQUEUE,
          LDLM_IBITS_ENQUEUE,
+        MDS_REINT_SETATTR,
          MDS_REINT_CREATE,
          MDS_REINT_LINK,
-        MDS_REINT_OPEN,
-        MDS_REINT_SETATTR,
-        MDS_REINT_RENAME,
          MDS_REINT_UNLINK,
+        MDS_REINT_RENAME,
+        MDS_REINT_OPEN,
+        BRW_READ_BYTES,
+        BRW_WRITE_BYTES,
          EXTRA_LAST_OPC
  };
  
@@ -311,11 +356,11 @@ extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
  #define lprocfs_counter_decr(stats, idx) \
          lprocfs_counter_sub(stats, idx, 1)
  
-extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, 
+extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
                                   enum lprocfs_fields_flags field);
  
-static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats, 
-                                            int idx, 
+static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
+                                            int idx,
                                              enum lprocfs_fields_flags field)
  {
          __u64 ret = 0;
@@ -328,12 +373,13 @@ static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
          return ret;
  }
  
-extern struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, 
+extern struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
                                                   enum lprocfs_stats_flags flags);
  extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
  extern void lprocfs_free_stats(struct lprocfs_stats **stats);
-extern void lprocfs_init_ops_stats(int num_private_stats, 
+extern void lprocfs_init_ops_stats(int num_private_stats,
                                     struct lprocfs_stats *stats);
+extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
  extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
                                     unsigned int num_private_stats);
  extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
@@ -346,9 +392,12 @@ extern int lprocfs_add_clear_entry(struct obd_device * obd,
  extern int lprocfs_exp_setup(struct obd_export *exp,
                               lnet_nid_t *peer_nid, int *newnid);
  extern int lprocfs_exp_cleanup(struct obd_export *exp);
-extern int lprocfs_add_simple(struct proc_dir_entry *root,
-                              char *name, read_proc_t *read_proc,
-                              write_proc_t *write_proc, void *data);
+extern cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                                char *name,
+                                                read_proc_t *read_proc,
+                                                write_proc_t *write_proc,
+                                                void *data,
+                                                struct file_operations *fops);
  extern int lprocfs_register_stats(cfs_proc_dir_entry_t *root, const char *name,
                                    struct lprocfs_stats *stats);
  
@@ -369,9 +418,6 @@ extern cfs_proc_dir_entry_t *lprocfs_srch(cfs_proc_dir_entry_t *root,
  
  extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
  extern int lprocfs_obd_cleanup(struct obd_device *obd);
-extern int lprocfs_add_simple(struct proc_dir_entry *root, char *name,
-                              read_proc_t *read_proc, write_proc_t *write_proc,
-                              void *data);
  struct nid_stat;
  extern void lprocfs_free_per_client_stats(struct obd_device *obd);
  extern int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
@@ -382,7 +428,7 @@ extern int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
  
  extern struct file_operations lprocfs_evict_client_fops;
  
-extern int lprocfs_seq_create(cfs_proc_dir_entry_t *parent, char *name, 
+extern int lprocfs_seq_create(cfs_proc_dir_entry_t *parent, char *name,
                                mode_t mode, struct file_operations *seq_fops,
                                void *data);
  extern int lprocfs_obd_seq_create(struct obd_device *dev, char *name,
@@ -411,6 +457,8 @@ extern int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
                                    int count, int *eof, void *data);
  extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
                                  int count, int *eof, void *data);
+extern int lprocfs_rd_import(char *page, char **start, off_t off, int count,
+                             int *eof, void *data);
  extern int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
                                      int count, int *eof, void *data);
  extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
@@ -418,7 +466,7 @@ extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
  extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
                                int count, int *eof, void *data);
  struct adaptive_timeout;
-extern int lprocfs_at_hist_helper(char *page, int count, int rc, 
+extern int lprocfs_at_hist_helper(char *page, int count, int rc,
                                    struct adaptive_timeout *at);
  extern int lprocfs_rd_timeouts(char *page, char **start, off_t off,
                                 int count, int *eof, void *data);
@@ -449,7 +497,7 @@ extern int lprocfs_write_helper(const char *buffer, unsigned long count,
                                  int *val);
  extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
                                       int *val, int mult);
-extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, 
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
                                      long val, int mult);
  extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
                                      __u64 *val);
@@ -470,6 +518,10 @@ extern int lprocfs_counter_write(struct file *file, const char *buffer,
  int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
                                     int count, int *eof, void *data);
  
+/* lprocfs_statuc.c: hash statistics */
+int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
+                        int count, int *eof, void *data);
+
  extern int lprocfs_seq_release(struct inode *, struct file *);
  
  /* in lprocfs_stat.c, to protect the private data for proc entries */
@@ -488,8 +540,14 @@ extern struct rw_semaphore _lprocfs_lock;
                  return -ENODEV;                 \
          }                                       \
  } while(0)
+#define LPROCFS_WRITE_ENTRY()     do {  \
+        down_write(&_lprocfs_lock);     \
+} while(0)
+#define LPROCFS_WRITE_EXIT()      do {  \
+        up_write(&_lprocfs_lock);       \
+} while(0)
  
-/* You must use these macros when you want to refer to 
+/* You must use these macros when you want to refer to
   * the import in a client obd_device for a lprocfs entry */
  #define LPROCFS_CLIMP_CHECK(obd) do {           \
          typecheck(struct obd_device *, obd);    \
@@ -503,8 +561,8 @@ extern struct rw_semaphore _lprocfs_lock;
          up_read(&(obd)->u.cli.cl_sem);
  
  
-/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only 
-  proc entries; otherwise, you will define name##_seq_write function also for 
+/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only
+  proc entries; otherwise, you will define name##_seq_write function also for
    a read-write proc entry, and then call LPROC_SEQ_SEQ instead. Finally,
    call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */
  #define __LPROC_SEQ_FOPS(name, custom_seq_write)                           \
@@ -544,6 +602,64 @@ int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
  int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
                                      unsigned long count, void *data);
  #endif
+#ifdef HAVE_DELAYED_RECOVERY
+int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data);
+int lprocfs_obd_wr_stale_export_age(struct file *file, const char *buffer,
+                                    unsigned long count, void *data);
+int lprocfs_obd_attach_stale_exports(struct obd_device *dev);
+int lprocfs_obd_wr_flush_stale_exports(struct file *file, const char *buffer,
+                                       unsigned long count, void *data);
+#endif
+/* all quota proc functions */
+extern int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count,
+                                  int *eof, void *data);
+extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+                                  unsigned long count, void *data);
+extern int lprocfs_quota_rd_btune(char *page, char **start, off_t off, int count,
+                                  int *eof, void *data);
+extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+                                  unsigned long count, void *data);
+extern int lprocfs_quota_rd_iunit(char *page, char **start, off_t off, int count,
+                                  int *eof, void *data);
+extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+                                  unsigned long count, void *data);
+extern int lprocfs_quota_rd_itune(char *page, char **start, off_t off, int count,
+                                  int *eof, void *data);
+extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+                                  unsigned long count, void *data);
+extern int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off,
+                                           int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer,
+                                           unsigned long count, void *data);
+extern int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data);
+extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+                                     unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_qs(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
+extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off,
+                                            int count, int *eof, void *data);
+extern int lprocfs_quota_wr_boundary_factor(struct file *file, const char *buffer,
+                                            unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off,
+                                        int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_bunit(struct file *file, const char *buffer,
+                                        unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off,
+                                        int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_iunit(struct file *file, const char *buffer,
+                                        unsigned long count, void *data);
+extern int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+extern int lprocfs_quota_wr_qs_factor(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
  
  #else
  /* LPROCFS is not defined */
@@ -558,8 +674,8 @@ static inline void lprocfs_counter_init(struct lprocfs_stats *stats,
                                          const char *name, const char *units)
  { return; }
  
-static inline __u64 lc_read_helper(struct lprocfs_counter *lc, 
-                                   enum lprocfs_fields_flags field) 
+static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
+                                   enum lprocfs_fields_flags field)
  { return 0; }
  
  static inline struct lprocfs_stats* lprocfs_alloc_stats(unsigned int num,
@@ -573,9 +689,11 @@ static inline int lprocfs_register_stats(cfs_proc_dir_entry_t *root,
                                              const char *name,
                                              struct lprocfs_stats *stats)
  { return 0; }
-static inline void lprocfs_init_ops_stats(int num_private_stats, 
+static inline void lprocfs_init_ops_stats(int num_private_stats,
                                            struct lprocfs_stats *stats)
  { return; }
+static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{ return; }
  static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
                                            unsigned int num_private_stats)
  { return 0; }
@@ -586,15 +704,16 @@ struct obd_export;
  static inline int lprocfs_add_clear_entry(struct obd_export *exp)
  { return 0; }
  static inline int lprocfs_exp_setup(struct obd_export *exp,
-                                   lnet_nid_t *peer_nid, int *newnid)
+                                    lnet_nid_t *peer_nid, int *newnid)
  { return 0; }
  static inline int lprocfs_exp_cleanup(struct obd_export *exp)
  { return 0; }
-static inline int lprocfs_add_simple(struct proc_dir_entry *root,
-                                     char *name,
-                                     read_proc_t *read_proc,
-                                     write_proc_t *write_proc,
-                                     void *data)
+static inline cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                                char *name,
+                                                read_proc_t *read_proc,
+                                                write_proc_t *write_proc,
+                                                void *data,
+                                                struct file_operations *fops)
  {return 0; }
  struct nid_stat;
  static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
@@ -633,6 +752,8 @@ static inline int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
  static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
                                         int count, int *eof, void *data)
  { return 0; }
+static inline int lprocfs_rd_import(char *page, char **start, off_t off, int count,
+                                    int *eof, void *data) { return 0; }
  static inline int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
                                             int count, int *eof, void *data)
  { return 0; }
@@ -643,7 +764,7 @@ static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off,
                                       int count, int *eof, void *data)
  { return 0; }
  struct adaptive_timeout;
-static inline int lprocfs_at_hist_helper(char *page, int count, int rc, 
+static inline int lprocfs_at_hist_helper(char *page, int count, int rc,
                                           struct adaptive_timeout *at)
  { return 0; }
  static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off,
@@ -658,8 +779,23 @@ static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
  static inline int lprocfs_wr_ping(struct file *file, const char *buffer,
                                    unsigned long count, void *data)
  { return 0; }
-
-
+#ifdef HAVE_DELAYED_RECOVERY
+static inline
+int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{ return 0; }
+static inline
+int lprocfs_obd_wr_stale_export_age(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
+{ return 0; }
+static inline
+int lprocfs_obd_attach_stale_exports(struct obd_device *dev)
+{ return 0; }
+static inline
+int lprocfs_obd_wr_flush_stale_exports(struct file *file, const char *buffer,
+                                       unsigned long count, void *data)
+{ return 0; }
+#endif
  /* Statfs helpers */
  static inline
  int lprocfs_rd_blksize(char *page, char **start, off_t off,
@@ -698,7 +834,7 @@ int lprocfs_counter_write(struct file *file, const char *buffer,
                            unsigned long count, void *data) { return 0; }
  
  static inline
-__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, 
+__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
                                 enum lprocfs_fields_flags field)
  { return (__u64)0; }
  
diff --git a/lustre/include/lustre/Makefile.am b/lustre/include/lustre/Makefile.am

index 0acd90f..8d16432 100644 (file)
--- a/lustre/include/lustre/Makefile.am
+++ b/lustre/include/lustre/Makefile.am
@@ -1,10 +1,41 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if UTILS
-pkginclude_HEADERS = lustre_idl.h lustre_user.h liblustreapi.h types.h
+pkginclude_HEADERS = lustre_idl.h lustre_user.h liblustreapi.h types.h ll_fiemap.h
  endif
  
-EXTRA_DIST = lustre_idl.h lustre_user.h liblustreapi.h types.h
+EXTRA_DIST = lustre_idl.h lustre_user.h liblustreapi.h types.h ll_fiemap.h
diff --git a/lustre/include/lustre/liblustreapi.h b/lustre/include/lustre/liblustreapi.h

index f1758fd..0c88b12 100644 (file)
--- a/lustre/include/lustre/liblustreapi.h
+++ b/lustre/include/lustre/liblustreapi.h
@@ -1,8 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
+
  #ifndef _LIBLUSTREAPI_H_
  #define _LIBLUSTREAPI_H_
  
@@ -36,12 +67,21 @@ enum llapi_message_level {
  extern void llapi_msg_set_level(int level);
  extern void llapi_err(int level, char *fmt, ...);
  extern void llapi_printf(int level, char *fmt, ...);
-extern int llapi_file_create(const char *name, unsigned long stripe_size,
+extern int llapi_file_create(const char *name, unsigned long long stripe_size,
                               int stripe_offset, int stripe_count,
                               int stripe_pattern);
  extern int llapi_file_open(const char *name, int flags, int mode,
-                           unsigned long stripe_size, int stripe_offset,
+                           unsigned long long stripe_size, int stripe_offset,
                             int stripe_count, int stripe_pattern);
+extern int llapi_file_create_pool(const char *name,
+                                  unsigned long long stripe_size,
+                                  int stripe_offset, int stripe_count,
+                                  int stripe_pattern, char *pool_name);
+extern int llapi_file_open_pool(const char *name, int flags, int mode,
+                                unsigned long long stripe_size,
+                                int stripe_offset, int stripe_count,
+                                int stripe_pattern, char *pool_name);
+extern int llapi_poollist(char *name);
  extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
  #define HAVE_LLAPI_FILE_LOOKUP
  extern int llapi_file_lookup(int dirfd, const char *name);
@@ -72,7 +112,9 @@ struct find_param {
                          exclude_gid:1,
                          exclude_uid:1,
                          check_gid:1,
-                        check_uid:1;
+                        check_uid:1,
+                        check_pool:1,
+                        exclude_pool:1;
  
          int     verbose;
          int     quiet;
@@ -94,6 +136,8 @@ struct find_param {
          /* In-precess parameters. */
          unsigned int depth;
          dev_t   st_dev;
+
+        char poolname[LOV_MAXPOOLNAME + 1];
  };
  
  extern int llapi_getstripe(char *path, struct find_param *param);
@@ -106,11 +150,13 @@ extern int llapi_ping(char *obd_type, char *obd_name);
  extern int llapi_target_check(int num_types, char **obd_types, char *dir);
  extern int llapi_catinfo(char *dir, char *keyword, char *node_name);
  extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid);
-extern int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_uuid);
+extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid);
  extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
  extern int llapi_is_lustre_mnttype(const char *type);
  extern int parse_size(char *optarg, unsigned long long *size,
                        unsigned long long *size_units, int bytes_spec);
+extern void llapi_ping_target(char *obd_type, char *obd_name,
+                              char *obd_uuid, void *args);
  struct mntent;
  #define HAVE_LLAPI_IS_LUSTRE_MNT
  extern int llapi_is_lustre_mnt(struct mntent *mnt);
diff --git a/lustre/include/lustre/ll_fiemap.h b/lustre/include/lustre/ll_fiemap.h

new file mode 100644 (file)

index 0000000..92f71a9
--- /dev/null
+++ b/lustre/include/lustre/ll_fiemap.h
@@ -0,0 +1,129 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
+ * Author: Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LUSTRE_FIEMAP_H
+#define _LUSTRE_FIEMAP_H
+
+#ifndef HAVE_LINUX_FIEMAP_H
+
+#include <linux/lustre_types.h>
+
+struct ll_fiemap_extent {
+        __u64   fe_logical;  /* logical offset in bytes for the start of
+                              * the extent from the beginning of the file */
+        __u64   fe_physical; /* physical offset in bytes for the start
+                              * of the extent from the beginning of the disk */
+        __u64   fe_length;   /* length in bytes for the extent */
+        __u32   fe_flags;    /* FIEMAP_EXTENT_* flags for the extent */
+        __u32   fe_device;   /* device number for this extent */
+};
+
+struct ll_user_fiemap {
+        __u64   fm_start;         /* logical offset (inclusive) at
+                                   * which to start mapping (in) */
+        __u64   fm_length;        /* logical length of mapping which
+                                   * userspace wants (in) */
+        __u32   fm_flags;         /* FIEMAP_FLAG_* flags for request (in/out) */
+        __u32   fm_mapped_extents;/* number of extents that were mapped (out) */
+        __u32   fm_extent_count;  /* size of fm_extents array (in) */
+        __u32   fm_reserved;
+        struct  ll_fiemap_extent   fm_extents[0]; /* array of mapped extents (out).
+                                                   * Lustre uses first extent to
+                                                   * send end_offset */
+};
+
+#define FIEMAP_MAX_OFFSET      (~0ULL)
+
+#define FIEMAP_FLAG_SYNC         0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR        0x00000002 /* map extended attribute tree */
+#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */
+
+#define FIEMAP_FLAGS_COMPAT    (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR | \
+                                FIEMAP_FLAG_DEVICE_ORDER)
+
+#define FIEMAP_EXTENT_LAST             0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN          0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC         0x00000004 /* Location still pending.
+                                                   * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_NO_DIRECT        0x00000008 /* Data mapping undefined */
+#define FIEMAP_EXTENT_SECONDARY        0x00000010 /* Data copied offline. May
+                                                   * set EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NET              0x00000020 /* Data stored remotely.
+                                                   * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_DATA_COMPRESSED  0x00000040 /* Data is compressed by fs.
+                                                   * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED   0x00000080 /* Data is encrypted by fs.
+                                                   * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED      0x00000100 /* Extent offsets may not be
+                                                   * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE      0x00000200 /* Data mixed with metadata.
+                                                   * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL        0x00000400 /* Multiple files in block.
+                                                   * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN        0x00000800 /* Space allocated, but
+                                                   * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED           0x00001000 /* File does not natively
+                                                   * support extents. Result
+                                                   * merged for efficiency. */
+
+#else
+
+#define ll_fiemap_extent fiemap_extent
+#define ll_user_fiemap   fiemap
+
+#endif /* HAVE_LINUX_FIEMAP_H */
+
+static inline size_t fiemap_count_to_size(size_t extent_count)
+{
+        return (sizeof(struct ll_user_fiemap) + extent_count *
+                                               sizeof(struct ll_fiemap_extent));
+}
+
+static inline unsigned fiemap_size_to_count(size_t array_size)
+{
+        return ((array_size - sizeof(struct ll_user_fiemap)) /
+                                               sizeof(struct ll_fiemap_extent));
+}
+
+#endif /* _LUSTRE_FIEMAP_H */
diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h

index dace6cc..a092647 100644 (file)
--- a/lustre/include/lustre/lustre_idl.h
+++ b/lustre/include/lustre/lustre_idl.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_idl.h
   *
   * Lustre wire protocol definitions.
   *
@@ -64,6 +96,9 @@
  /* Defn's shared with user-space. */
  #include <lustre/lustre_user.h>
  #include <lustre_ver.h>
+#include <lustre/ll_fiemap.h>
+
+#include <libcfs/kp30.h>
  
  /*
   * this file contains all data structures used in Lustre interfaces:
@@ -109,6 +144,8 @@
  #define MGS_REQUEST_PORTAL             26
  #define MGS_REPLY_PORTAL               27
  #define OST_REQUEST_PORTAL             28
+#define FLD_REQUEST_PORTAL             29
+#define SEQ_METADATA_PORTAL            30
  
  #define SVC_KILLED               1
  #define SVC_EVENT                2
@@ -199,6 +236,7 @@ struct lustre_msg_v2 {
  };
  
  /* without security, ptlrpc_body is put in the first buffer. */
+#define PTLRPC_NUM_VERSIONS     4
  struct ptlrpc_body {
          struct lustre_handle pb_handle;
          __u32 pb_type;
@@ -206,7 +244,7 @@ struct ptlrpc_body {
          __u32 pb_opc;
          __u32 pb_status;
          __u64 pb_last_xid;
-        __u64 pb_last_seen;
+        __u64 pb_last_seen; /* not used */
          __u64 pb_last_committed;
          __u64 pb_transno;
          __u32 pb_flags;
@@ -216,9 +254,13 @@ struct ptlrpc_body {
          __u32 pb_service_time; /* for rep, actual service time */
          __u32 pb_limit;
          __u64 pb_slv;
+        /* VBR: pre-versions */
+        __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+        /* padding for future needs */
+        __u64 pb_padding[4];
  };
  
-extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
+extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb, int msgsize);
  
  /* message body offset for lustre_msg_v2 */
  /* ptlrpc body offset in all request/reply messages */
@@ -253,23 +295,28 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
  #define MSG_RESENT             2
  #define MSG_REPLAY             4
  /* #define MSG_AT_SUPPORT         8  avoid until 1.10+ */
+#define MSG_DELAY_REPLAY       0x10
+#define MSG_VERSION_REPLAY     0x20
  
  /*
   * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
   */
  
-#define MSG_CONNECT_RECOVERING  0x1
-#define MSG_CONNECT_RECONNECT   0x2
-#define MSG_CONNECT_REPLAYABLE  0x4
+#define MSG_CONNECT_RECOVERING  0x00000001
+#define MSG_CONNECT_RECONNECT   0x00000002
+#define MSG_CONNECT_REPLAYABLE  0x00000004
  //#define MSG_CONNECT_PEER        0x8
-#define MSG_CONNECT_LIBCLIENT   0x10
-#define MSG_CONNECT_INITIAL     0x20
-#define MSG_CONNECT_ASYNC       0x40
-#define MSG_CONNECT_NEXT_VER    0x80 /* use next version of lustre_msg */
+#define MSG_CONNECT_LIBCLIENT   0x00000010
+#define MSG_CONNECT_INITIAL     0x00000020
+#define MSG_CONNECT_ASYNC       0x00000040
+#define MSG_CONNECT_NEXT_VER    0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO     0x00000100
+#define MSG_CONNECT_DELAYED     0x00000200
  
  /* Connect flags */
  #define OBD_CONNECT_RDONLY            0x1ULL /*client allowed read-only access*/
  #define OBD_CONNECT_INDEX             0x2ULL /*connect to specific LOV idx */
+#define OBD_CONNECT_MDS               0x4ULL /*connect from MDT to OST */
  #define OBD_CONNECT_GRANT             0x8ULL /*OSC acquires grant at connect */
  #define OBD_CONNECT_SRVLOCK          0x10ULL /*server takes locks for client */
  #define OBD_CONNECT_VERSION          0x20ULL /*Lustre versions in ocd */
@@ -295,10 +342,11 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
  #define OBD_CONNECT_LRU_RESIZE 0x02000000ULL /*Lru resize feature. */
  #define OBD_CONNECT_MDS_MDS    0x04000000ULL /*MDS-MDS connection */
  #define OBD_CONNECT_REAL       0x08000000ULL /*real connection */
-#define OBD_CONNECT_CHANGE_QS  0x10000000ULL /*shrink/enlarge qunit size
-                                              *b=10600 */
+#define OBD_CONNECT_CHANGE_QS  0x10000000ULL /*shrink/enlarge qunit b=10600 */
  #define OBD_CONNECT_CKSUM      0x20000000ULL /*support several cksum algos */
  #define OBD_CONNECT_FID        0x40000000ULL /* FID is supported */
+#define OBD_CONNECT_VBR        0x80000000ULL /* version based recovery */
+#define OBD_CONNECT_LOV_V3    0x100000000ULL /* client supports lov v3 ea */
  /* also update obd_connect_names[] for lprocfs_rd_connect_flags()
   * and lustre/utils/wirecheck.c */
  
@@ -313,14 +361,16 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                  OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \
                                  OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
                                  OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
-                                LRU_RESIZE_CONNECT_FLAG)
+                                LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_VBR |\
+                                OBD_CONNECT_LOV_V3)
  #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                  OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                  OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
                                  OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \
                                  OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
                                  LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
-                                OBD_CONNECT_CHANGE_QS)
+                                OBD_CONNECT_VBR | OBD_CONNECT_CHANGE_QS | \
+                                OBD_CONNECT_MDS)
  #define ECHO_CONNECT_SUPPORTED (0)
  #define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT)
  
@@ -353,6 +403,14 @@ struct obd_connect_data {
  
  extern void lustre_swab_connect(struct obd_connect_data *ocd);
  
+/* b1_6 has smaller body. The defines below is for interoperability */
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2,0,0,0)
+#define PTLRPC_INTEROP_1_6      1
+#define PTLRPC_BODY_MIN_SIZE    offsetof(struct ptlrpc_body, pb_pre_versions)
+#else
+#define PTLRPC_BODY_MIN_SIZE    sizeof(struct ptlrpc_body)
+#endif
+
  /*
   * Supported checksum algorithms. Up to 32 checksum types are supported.
   * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
@@ -437,6 +495,7 @@ typedef __u32 obd_count;
  #define LOV_MAGIC_V1      0x0BD10BD0
  #define LOV_MAGIC         LOV_MAGIC_V1
  #define LOV_MAGIC_JOIN    0x0BD20BD0
+#define LOV_MAGIC_V3      0x0BD30BD0
  
  #define LOV_PATTERN_RAID0 0x001   /* stripes are used round-robin */
  #define LOV_PATTERN_RAID1 0x002   /* stripes are mirrors of each other */
@@ -465,6 +524,18 @@ struct lov_mds_md_v1 {            /* LOV EA mds/wire data (little-endian) */
          struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
  };
  
+struct lov_mds_md_v3 {            /* LOV EA mds/wire data (little-endian) */
+        __u32 lmm_magic;          /* magic number = LOV_MAGIC_V3 */
+        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+        __u64 lmm_object_id;      /* LOV object ID */
+        __u64 lmm_object_gr;      /* LOV object group */
+        __u32 lmm_stripe_size;    /* size of stripe in bytes */
+        __u32 lmm_stripe_count;   /* num stripes in use for this object */
+        char  lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */
+        struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+
  #define OBD_MD_FLID        (0x00000001ULL) /* object ID */
  #define OBD_MD_FLATIME     (0x00000002ULL) /* access time */
  #define OBD_MD_FLMTIME     (0x00000004ULL) /* data modification time */
@@ -499,11 +570,18 @@ struct lov_mds_md_v1 {            /* LOV EA mds/wire data (little-endian) */
  
  #define OBD_MD_MDS         (0x0000000100000000ULL) /* where an inode lives on */
  #define OBD_MD_REINT       (0x0000000200000000ULL) /* reintegrate oa */
+#define OBD_MD_MEA         (0x0000000400000000ULL) /* CMD split EA  */
  
  #define OBD_MD_FLXATTR     (0x0000001000000000ULL) /* xattr */
  #define OBD_MD_FLXATTRLS   (0x0000002000000000ULL) /* xattr list */
  #define OBD_MD_FLXATTRRM   (0x0000004000000000ULL) /* xattr remove */
  #define OBD_MD_FLACL       (0x0000008000000000ULL) /* ACL */
+#define OBD_MD_FLRMTPERM   (0x0000010000000000ULL) /* remote permission */
+#define OBD_MD_FLMDSCAPA   (0x0000020000000000ULL) /* MDS capability */
+#define OBD_MD_FLOSSCAPA   (0x0000040000000000ULL) /* OSS capability */
+#define OBD_MD_FLCKSPLIT   (0x0000080000000000ULL) /* Check split on server */
+#define OBD_MD_FLCROSSREF  (0x0000100000000000ULL) /* Cross-ref case */
+
  
  #define OBD_MD_FLGETATTR (OBD_MD_FLID    | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
                            OBD_MD_FLCTIME | OBD_MD_FLSIZE  | OBD_MD_FLBLKSZ | \
@@ -542,6 +620,9 @@ extern void lustre_swab_obd_statfs (struct obd_statfs *os);
  #define OBD_STATFS_NODELAY      0x0001  /* requests should be send without delay
                                           * and resends for avoid deadlocks */
  
+#define OBD_STATFS_FROM_CACHE   0x0002  /* the statfs callback should not update
+                                         * obd_osfs_age */
+
  /* ost_body.data values for OST_BRW */
  
  #define OBD_BRW_READ            0x01
@@ -640,8 +721,9 @@ typedef enum {
          REINT_UNLINK   = 4,
          REINT_RENAME   = 5,
          REINT_OPEN     = 6,
-//      REINT_CLOSE    = 7,
-//      REINT_WRITE    = 8,
+        REINT_SETXATTR = 7,
+//      REINT_CLOSE    = 8,
+//      REINT_WRITE    = 9,
          REINT_MAX
  } mds_reint_t;
  
@@ -669,6 +751,252 @@ typedef enum {
  
  extern void lustre_swab_ll_fid (struct ll_fid *fid);
  
+struct lu_fid {
+        __u64 f_seq;  /* holds fid sequence. Lustre should support 2^64
+                       * objects, thus even if one sequence has one object we
+                       * reach this value. */
+        __u32 f_oid;  /* fid number within its sequence. */
+        __u32 f_ver;  /* holds fid version. */
+};
+
+#define DFID "[0x%16.16"LPF64"x/0x%8.8x:0x%8.8x]"
+
+#define PFID(fid)     \
+        fid_seq(fid), \
+        fid_oid(fid), \
+        fid_ver(fid)
+
+enum { 
+        /** put FID sequence at this offset in ldlm_res_id. */
+        LUSTRE_RES_ID_SEQ_OFF = 0,
+        /** put FID oid at this offset in ldlm_res_id. */
+        LUSTRE_RES_ID_OID_OFF = 1,
+        /** put FID version at this offset in ldlm_res_id. */
+        LUSTRE_RES_ID_VER_OFF = 2,
+        /** put pdo hash at this offset in ldlm_res_id. */
+        LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+typedef __u64 seqno_t;
+
+/**
+ * Describes a range of sequence, lsr_start is included but lsr_end is
+ * not in the range.
+ */
+struct lu_seq_range {
+        __u64 lsr_start;
+        __u64 lsr_end;
+        /** this feild is not used in 1.8 client interop */
+        __u32 lsr_mdt;
+        __u32 lsr_padding;
+};
+
+/**
+ * returns  width of given range \a r
+ */
+
+static inline __u64 range_space(const struct lu_seq_range *r)
+{
+        return r->lsr_end - r->lsr_start;
+}
+
+/**
+ * initialize range to zero
+ */
+static inline void range_init(struct lu_seq_range *r)
+{
+        r->lsr_start = r->lsr_end = 0;
+}
+
+/**
+ * check if given seq id \a s is within given range \a r
+ */
+static inline int range_within(const struct lu_seq_range *r,
+                               __u64 s)
+{
+        return s >= r->lsr_start && s < r->lsr_end;
+}
+
+/**
+ * sanity check for range \a r
+ */
+static inline int range_is_sane(const struct lu_seq_range *r)
+{
+        return (r->lsr_end >= r->lsr_start);
+}
+
+static inline int range_is_zero(struct lu_seq_range *r)
+{
+        return (r->lsr_start == 0 && r->lsr_end == 0);
+}
+
+static inline int range_is_exhausted(const struct lu_seq_range *r)
+{
+        return range_space(r) == 0;
+}
+
+#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x)"
+
+#define PRANGE(range)      \
+        (range)->lsr_start, \
+        (range)->lsr_end
+
+enum {
+        /*
+         * This is how may FIDs may be allocated in one sequence.
+         */
+        LUSTRE_SEQ_MAX_WIDTH = 0x0000000000004000ULL,
+};
+
+enum lu_cli_type {
+        LUSTRE_SEQ_METADATA,
+        LUSTRE_SEQ_DATA
+};
+
+struct lu_client_seq {
+        /* Sequence-controller export. */
+        struct obd_export      *lcs_exp;
+        struct semaphore        lcs_sem;
+
+        /*
+         * Range of allowed for allocation sequences. When using lu_client_seq
+         * on clients, this contains meta-sequence range. And for servers this
+         * contains super-sequence range.
+         */
+        struct lu_seq_range         lcs_space;
+
+        /* This holds last allocated fid in last obtained seq */
+        struct lu_fid           lcs_fid;
+
+        /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+        enum lu_cli_type        lcs_type;
+        /*
+         * Service uuid, passed from MDT + seq name to form unique seq name to
+         * use it with procfs.
+         */
+        char                    lcs_name[80];
+
+        /*
+         * Sequence width, that is how many objects may be allocated in one
+         * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH.
+         */
+        __u64                   lcs_width;
+
+};
+
+/*
+ * fid constants
+ */
+enum {
+        /* initial fid id value */
+        LUSTRE_FID_INIT_OID  = 1UL
+};
+
+extern void lustre_swab_lu_fid(struct lu_fid *fid);
+
+/* get object sequence */
+static inline __u64 fid_seq(const struct lu_fid *fid)
+{
+        return fid->f_seq;
+}
+
+/* get object id */
+static inline __u32 fid_oid(const struct lu_fid *fid)
+{
+        return fid->f_oid;
+}
+
+/* get object version */
+static inline __u32 fid_ver(const struct lu_fid *fid)
+{
+        return fid->f_ver;
+}
+
+static inline void fid_init(struct lu_fid *fid)
+{
+        memset(fid, 0, sizeof(*fid));
+}
+
+/* Normal FID sequence starts from this value, i.e. 1<<33 */
+#define FID_SEQ_START  0x200000000ULL
+
+/* IDIF sequence starts from this value, i.e. 1<<32 */
+#define IDIF_SEQ_START 0x100000000ULL
+
+/**
+ * Check if a fid is igif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is a igif; otherwise false. 
+ */
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+        return fid_seq(fid) > 0 && fid_seq(fid) < IDIF_SEQ_START;
+}
+
+/**
+ * Check if a fid is idif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is a idif; otherwise false. 
+ */
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+        return fid_seq(fid) >= IDIF_SEQ_START  && fid_seq(fid) < FID_SEQ_START;
+}
+
+/**
+ * Check if a fid is zero.
+ * \param fid the fid to be tested.
+ * \return true if the fid is zero; otherwise false. 
+ */
+static inline int fid_is_zero(const struct lu_fid *fid)
+{
+        return fid_seq(fid) == 0 && fid_oid(fid) == 0;
+}
+
+/**
+ * Get inode number from a igif.
+ * \param fid a igif to get inode number from.
+ * \return inode number for the igif.
+ */
+static inline ino_t lu_igif_ino(const struct lu_fid *fid)
+{
+        return fid_seq(fid);
+}
+
+/**
+ * Get inode generation from a igif.
+ * \param fid a igif to get inode generation from.
+ * \return inode generation for the igif.
+ */ 
+static inline __u32 lu_igif_gen(const struct lu_fid *fid)
+{
+        return fid_oid(fid);
+}
+
+/**
+ * Check if two fids are equal or not.
+ * \param f0 the first fid
+ * \param f1 the second fid
+ * \return true if the two fids are equal; otherwise false. 
+ */
+static inline int lu_fid_eq(const struct lu_fid *f0,
+                            const struct lu_fid *f1)
+{
+        /* Check that there is no alignment padding. */
+        CLASSERT(sizeof *f0 ==
+                 sizeof f0->f_seq + sizeof f0->f_oid + sizeof f0->f_ver);
+        LASSERTF(fid_is_igif(f0) || fid_ver(f0) == 0, DFID"\n", PFID(f0));
+        LASSERTF(fid_is_igif(f1) || fid_ver(f1) == 0, DFID"\n", PFID(f1));
+        return memcmp(f0, f1, sizeof *f0) == 0;
+}
+
+void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src);
+void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src);
+
+struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f, struct ldlm_res_id *name);
+int fid_res_name_eq(const struct lu_fid *f, const struct ldlm_res_id *name);
+
  #define MDS_STATUS_CONN 1
  #define MDS_STATUS_LOV 2
  
@@ -723,6 +1051,16 @@ static inline int ll_inode_to_ext_flags(int iflags, int keep)
  }
  #endif
  
+/*
+ * while mds_body is to interact with 1.6, mdt_body is to interact with 2.0.
+ * both of them should have the same fields layout, because at client side
+ * one could be dynamically cast to the other.
+ *
+ * mdt_body has large size than mds_body, with unused padding (48 bytes)
+ * at the end. client always use size of mdt_body to prepare request/reply
+ * buffers, and actual data could be interepeted as mdt_body or mds_body
+ * accordingly.
+ */
  struct mds_body {
          struct ll_fid  fid1;
          struct ll_fid  fid2;
@@ -755,6 +1093,42 @@ struct mds_body {
  
  extern void lustre_swab_mds_body (struct mds_body *b);
  
+struct mdt_body {
+        struct lu_fid  fid1;
+        struct lu_fid  fid2;
+        struct lustre_handle handle;
+        __u64          valid;
+        __u64          size;   /* Offset, in the case of MDS_READPAGE */
+        __u64          mtime;
+        __u64          atime;
+        __u64          ctime;
+        __u64          blocks; /* XID, in the case of MDS_READPAGE */
+        __u64          ioepoch;
+        __u64          ino;    /* for 1.6 compatibility */
+        __u32          fsuid;
+        __u32          fsgid;
+        __u32          capability;
+        __u32          mode;
+        __u32          uid;
+        __u32          gid;
+        __u32          flags; /* from vfs for pin/unpin, MDS_BFLAG for close */
+        __u32          rdev;
+        __u32          nlink; /* #bytes to read in the case of MDS_READPAGE */
+        __u32          generation; /* for 1.6 compatibility */
+        __u32          suppgid;
+        __u32          eadatasize;
+        __u32          aclsize;
+        __u32          max_mdsize;
+        __u32          max_cookiesize;
+        __u32          padding_4; /* also fix lustre_swab_mdt_body */
+        __u64          padding_5;
+        __u64          padding_6;
+        __u64          padding_7;
+        __u64          padding_8;
+        __u64          padding_9;
+        __u64          padding_10;
+};
+
  #define Q_QUOTACHECK    0x800100
  #define Q_INITQUOTA     0x800101        /* init slave limits */
  #define Q_GETOINFO      0x800102        /* get obd quota info */
@@ -848,6 +1222,7 @@ struct mds_rec_setattr {
  #define MDS_ATTR_KILL_SGID  0x1000ULL /* = 4096 */
  #define MDS_ATTR_CTIME_SET  0x2000ULL /* = 8192 */
  #define MDS_ATTR_FROM_OPEN  0x4000ULL /* = 16384, called from open path, ie O_TRUNC */
+#define MDS_ATTR_BLOCKS     0x8000ULL /* = 32768 */
  
  extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa);
  
@@ -891,6 +1266,39 @@ struct mds_rec_create {
  
  extern void lustre_swab_mds_rec_create (struct mds_rec_create *cr);
  
+struct mdt_rec_create {
+        __u32           cr_opcode;
+        __u32           cr_cap;
+        __u32           cr_fsuid;
+        __u32           cr_fsuid_h;
+        __u32           cr_fsgid;
+        __u32           cr_fsgid_h;
+        __u32           cr_suppgid1;
+        __u32           cr_suppgid1_h;
+        __u32           cr_suppgid2;
+        __u32           cr_suppgid2_h;
+        struct lu_fid   cr_fid1;
+        struct lu_fid   cr_fid2;
+        struct lustre_handle cr_old_handle; /* handle in case of open replay */
+        __u64           cr_time;
+        __u64           cr_rdev;
+        __u64           cr_ioepoch;
+        __u64           cr_padding_1; /* pad for 64 bits*/
+        __u32           cr_mode;
+        __u32           cr_bias;
+        __u32           cr_flags;     /* for use with open */
+        __u32           cr_padding_2; /* pad for 64 bits*/
+        __u32           cr_padding_3; /* pad for 64 bits*/
+        __u32           cr_padding_4; /* pad for 64 bits*/
+};
+
+struct mdt_epoch {
+        struct lustre_handle handle;
+        __u64  ioepoch;
+        __u32  flags;
+        __u32  padding;
+};
+
  struct mds_rec_join {
          struct ll_fid  jr_fid;
          __u64          jr_headsize;
@@ -898,6 +1306,12 @@ struct mds_rec_join {
  
  extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr);
  
+struct mdt_rec_join {
+        struct lu_fid  jr_fid;
+        __u64          jr_headsize;
+};
+
+
  struct mds_rec_link {
          __u32           lk_opcode;
          __u32           lk_fsuid;
@@ -916,6 +1330,32 @@ struct mds_rec_link {
  
  extern void lustre_swab_mds_rec_link (struct mds_rec_link *lk);
  
+struct mdt_rec_link {
+        __u32           lk_opcode;
+        __u32           lk_cap;
+        __u32           lk_fsuid;
+        __u32           lk_fsuid_h;
+        __u32           lk_fsgid;
+        __u32           lk_fsgid_h;
+        __u32           lk_suppgid1;
+        __u32           lk_suppgid1_h;
+        __u32           lk_suppgid2;
+        __u32           lk_suppgid2_h;
+        struct lu_fid   lk_fid1;
+        struct lu_fid   lk_fid2;
+        __u64           lk_time;
+        __u64           lk_padding_1;
+        __u64           lk_padding_2;
+        __u64           lk_padding_3;
+        __u64           lk_padding_4;
+        __u32           lk_bias;
+        __u32           lk_padding_5;
+        __u32           lk_padding_6;
+        __u32           lk_padding_7;
+        __u32           lk_padding_8;
+        __u32           lk_padding_9;
+};
+
  struct mds_rec_unlink {
          __u32           ul_opcode;
          __u32           ul_fsuid;
@@ -934,6 +1374,32 @@ struct mds_rec_unlink {
  
  extern void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul);
  
+struct mdt_rec_unlink {
+        __u32           ul_opcode;
+        __u32           ul_cap;
+        __u32           ul_fsuid;
+        __u32           ul_fsuid_h;
+        __u32           ul_fsgid;
+        __u32           ul_fsgid_h;
+        __u32           ul_suppgid1;
+        __u32           ul_suppgid1_h;
+        __u32           ul_padding2;
+        __u32           ul_padding2_h;
+        struct lu_fid   ul_fid1;
+        struct lu_fid   ul_fid2;
+        __u64           ul_time;
+        __u64           ul_padding_2;
+        __u64           ul_padding_3;
+        __u64           ul_padding_4;
+        __u64           ul_padding_5;
+        __u32           ul_bias;
+        __u32           ul_mode;
+        __u32           ul_padding_6;
+        __u32           ul_padding_7;
+        __u32           ul_padding_8;
+        __u32           ul_padding_9;
+};
+
  struct mds_rec_rename {
          __u32           rn_opcode;
          __u32           rn_fsuid;
@@ -952,11 +1418,114 @@ struct mds_rec_rename {
  
  extern void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn);
  
+struct mdt_rec_rename {
+        __u32           rn_opcode;
+        __u32           rn_cap;
+        __u32           rn_fsuid;
+        __u32           rn_fsuid_h;
+        __u32           rn_fsgid;
+        __u32           rn_fsgid_h;
+        __u32           rn_suppgid1;
+        __u32           rn_suppgid1_h;
+        __u32           rn_suppgid2;
+        __u32           rn_suppgid2_h;
+        struct lu_fid   rn_fid1;
+        struct lu_fid   rn_fid2;
+        __u64           rn_time;
+        __u64           rn_padding_1;
+        __u64           rn_padding_2;
+        __u64           rn_padding_3;
+        __u64           rn_padding_4;
+        __u32           rn_bias;      /* some operation flags */
+        __u32           rn_mode;      /* cross-ref rename has mode */
+        __u32           rn_padding_5;
+        __u32           rn_padding_6;
+        __u32           rn_padding_7;
+        __u32           rn_padding_8;
+};
+
+struct mdt_rec_setattr {
+        __u32           sa_opcode;
+        __u32           sa_cap;
+        __u32           sa_fsuid;
+        __u32           sa_fsuid_h;
+        __u32           sa_fsgid;
+        __u32           sa_fsgid_h;
+        __u32           sa_suppgid;
+        __u32           sa_suppgid_h;
+        __u32           sa_padding_1;
+        __u32           sa_padding_1_h;
+        struct lu_fid   sa_fid;
+        __u64           sa_valid;
+        __u32           sa_uid;
+        __u32           sa_gid;
+        __u64           sa_size;
+        __u64           sa_blocks;
+        __u64           sa_mtime;
+        __u64           sa_atime;
+        __u64           sa_ctime;
+        __u32           sa_attr_flags;
+        __u32           sa_mode;
+        __u32           sa_padding_2;
+        __u32           sa_padding_3;
+        __u32           sa_padding_4;
+        __u32           sa_padding_5;
+};
+
+struct mdt_rec_setxattr {
+        __u32           sx_opcode;
+        __u32           sx_cap;
+        __u32           sx_fsuid;
+        __u32           sx_fsuid_h;
+        __u32           sx_fsgid;
+        __u32           sx_fsgid_h;
+        __u32           sx_suppgid1;
+        __u32           sx_suppgid1_h;
+        __u32           sx_suppgid2;
+        __u32           sx_suppgid2_h;
+        struct lu_fid   sx_fid;
+        __u64           sx_padding_1; /* These three members are lu_fid size */
+        __u32           sx_padding_2;
+        __u32           sx_padding_3;
+        __u64           sx_valid;
+        __u64           sx_padding_4;
+        __u64           sx_padding_5;
+        __u64           sx_padding_6;
+        __u64           sx_padding_7;
+        __u32           sx_size;
+        __u32           sx_flags;
+        __u32           sx_padding_8;
+        __u32           sx_padding_9;
+        __u32           sx_padding_10;
+        __u32           sx_padding_11;
+};
+
+/*
+ * capa related definitions
+ */
+#define CAPA_HMAC_MAX_LEN       64
+#define CAPA_HMAC_KEY_MAX_LEN   56
+
+/* NB take care when changing the sequence of elements this struct,
+ * because the offset info is used in find_capa() */
+struct lustre_capa {
+        struct lu_fid   lc_fid;         /** fid */
+        __u64           lc_opc;         /** operations allowed */
+        __u64           lc_uid;         /** file owner */
+        __u64           lc_gid;         /** file group */
+        __u32           lc_flags;       /** HMAC algorithm & flags */
+        __u32           lc_keyid;       /** key# used for the capability */
+        __u32           lc_timeout;     /** capa timeout value (sec) */
+        __u32           lc_expiry;      /** expiry time (sec) */
+        __u8            lc_hmac[CAPA_HMAC_MAX_LEN];   /** HMAC */
+} __attribute__((packed));
+
  /*
   *  LOV data structures
   */
  
-#define LOV_MIN_STRIPE_SIZE 65536   /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_BITS 16   /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1<<LOV_MIN_STRIPE_BITS)
  #define LOV_MAX_STRIPE_COUNT  160   /* until bug 4424 is fixed */
  #define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
  
@@ -1183,12 +1752,15 @@ struct cfg_marker {
          __u32             cm_flags;
          __u32             cm_vers;       /* lustre release version number */
          __u32             padding;       /* 64 bit align */
-        time_t            cm_createtime; /*when this record was first created */
-        time_t            cm_canceltime; /*when this record is no longer valid*/
+        __u64             cm_createtime; /*when this record was first created */
+        __u64             cm_canceltime; /*when this record is no longer valid*/
          char              cm_tgtname[MTI_NAME_MAXLEN];
          char              cm_comment[MTI_NAME_MAXLEN];
  };
  
+extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
+                                   int swab, int size);
+
  /*
   * Opcodes for multiple servers.
   */
@@ -1239,6 +1811,7 @@ typedef enum {
          OST_RAID1_REC    = LLOG_OP_MAGIC | 0x01000,
          MDS_UNLINK_REC   = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_UNLINK,
          MDS_SETATTR_REC  = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_SETATTR,
+        MDS_SETATTR64_REC= LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | REINT_SETATTR,
          OBD_CFG_REC      = LLOG_OP_MAGIC | 0x20000,
          PTL_CFG_REC      = LLOG_OP_MAGIC | 0x30000, /* obsolete */
          LLOG_GEN_REC     = LLOG_OP_MAGIC | 0x40000,
@@ -1303,7 +1876,7 @@ struct llog_create_rec {
          struct llog_rec_hdr     lcr_hdr;
          struct ll_fid           lcr_fid;
          obd_id                  lcr_oid;
-        obd_count               lcr_ogen;
+        obd_count               lcr_ogr;
          __u32                   padding;
          struct llog_rec_tail    lcr_tail;
  } __attribute__((packed));
@@ -1319,21 +1892,33 @@ struct llog_orphan_rec {
  struct llog_unlink_rec {
          struct llog_rec_hdr     lur_hdr;
          obd_id                  lur_oid;
-        obd_count               lur_ogen;
-        __u32                   padding;
+        obd_count               lur_ogr;
+        obd_count               lur_count; /* to destroy the lost precreated */
          struct llog_rec_tail    lur_tail;
  } __attribute__((packed));
  
  struct llog_setattr_rec {
          struct llog_rec_hdr     lsr_hdr;
          obd_id                  lsr_oid;
-        obd_count               lsr_ogen;
+        obd_count               lsr_ogr;
          __u32                   lsr_uid;
          __u32                   lsr_gid;
          __u32                   padding;
          struct llog_rec_tail    lsr_tail;
  } __attribute__((packed));
  
+struct llog_setattr64_rec {
+        struct llog_rec_hdr     lsr_hdr;
+        obd_id                  lsr_oid;
+        obd_count               lsr_ogr;
+        __u32                   padding;
+        __u32                   lsr_uid;
+        __u32                   lsr_uid_h;
+        __u32                   lsr_gid;
+        __u32                   lsr_gid_h;
+        struct llog_rec_tail    lsr_tail;
+} __attribute__((packed));
+
  struct llog_size_change_rec {
          struct llog_rec_hdr     lsc_hdr;
          struct ll_fid           lsc_fid;
@@ -1493,12 +2078,21 @@ struct ost_body {
          struct  obdo oa;
  };
  
+/* Key for FIEMAP to be used in get_info calls */
+struct ll_fiemap_info_key {
+        char    name[8];
+        struct  obdo oa;
+        struct  ll_user_fiemap fiemap;
+};
  
  extern void lustre_swab_ost_body (struct ost_body *b);
  extern void lustre_swab_ost_last_id(obd_id *id);
+extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
  
-extern void lustre_swab_lov_user_md(struct lov_user_md *lum);
-extern void lustre_swab_lov_user_md_objects(struct lov_user_md *lum);
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                            int stripe_count);
  extern void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj);
  
  /* llog_swab.c */
@@ -1562,7 +2156,22 @@ extern int quota_copy_qdata(void *request, struct qunit_data *qdata,
  typedef enum {
          QUOTA_DQACQ     = 601,
          QUOTA_DQREL     = 602,
+        QUOTA_LAST_OPC
  } quota_cmd_t;
+#define QUOTA_FIRST_OPC QUOTA_DQACQ
+
+
+enum seq_rpc_opc {
+        SEQ_QUERY                       = 700,
+        SEQ_LAST_OPC,
+        SEQ_FIRST_OPC                   = SEQ_QUERY
+};
+
+enum seq_op {
+        SEQ_ALLOC_SUPER = 0,
+        SEQ_ALLOC_META = 1
+};
+
  
  #define JOIN_FILE_ALIGN 4096
  
@@ -1576,7 +2185,6 @@ typedef enum {
  #define QUOTA_RET_NOQUOTA      1 /* not support quota */
  #define QUOTA_RET_NOLIMIT      2 /* quota limit isn't set */
  #define QUOTA_RET_ACQUOTA      4 /* need to acquire extra quota */
-#define QUOTA_RET_INC_PENDING  8 /* pending value is increased */
  
  extern int quota_get_qunit_data_size(__u64 flag);
  #endif
diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h

index a87cf82..f263c38 100644 (file)
--- a/lustre/include/lustre/lustre_user.h
+++ b/lustre/include/lustre/lustre_user.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_user.h
   *
   * Lustre public user-space interface definitions.
   */
@@ -9,6 +41,7 @@
  #ifndef _LUSTRE_USER_H
  #define _LUSTRE_USER_H
  
+#include <lustre/ll_fiemap.h>
  #if defined(__linux__)
  #include <linux/lustre_user.h>
  #elif defined(__APPLE__)
@@ -29,8 +62,12 @@
  #define EXT3_IOC_SETVERSION             _IOW('f', 4, long)
  #define EXT3_IOC_GETVERSION_OLD         _IOR('v', 1, long)
  #define EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
+#define EXT3_IOC_FIEMAP                 _IOWR('f', 10, struct ll_user_fiemap)
  #endif
  
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
  struct obd_statfs;
  
  /* 
@@ -77,6 +114,10 @@ struct obd_statfs;
  #define LL_IOC_OBD_STATFS       IOC_OBD_STATFS
  #define IOC_MDC_GETSTRIPE       IOC_MDC_GETFILESTRIPE
  
+/* Do not define O_CHECK_STALE as 0200000000,
+ * which is conflict with MDS_OPEN_OWNEROVERRIDE */
+#define O_CHECK_STALE       020000000  /* hopefully this does not conflict */
+
  #define O_LOV_DELAY_CREATE 0100000000  /* hopefully this does not conflict */
  #define O_JOIN_FILE        0400000000  /* hopefully this does not conflict */
  
@@ -86,13 +127,16 @@ struct obd_statfs;
  
  #define LOV_USER_MAGIC_V1 0x0BD10BD0
  #define LOV_USER_MAGIC    LOV_USER_MAGIC_V1
-
  #define LOV_USER_MAGIC_JOIN 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
  
  #define LOV_PATTERN_RAID0 0x001
  #define LOV_PATTERN_RAID1 0x002
  #define LOV_PATTERN_FIRST 0x100
  
+#define LOV_MAXPOOLNAME 16
+#define LOV_POOLNAMEF "%.16s"
+
  #define lov_user_ost_data lov_user_ost_data_v1
  struct lov_user_ost_data_v1 {     /* per-stripe data structure */
          __u64 l_object_id;        /* OST object ID */
@@ -113,6 +157,18 @@ struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
          struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
  } __attribute__((packed));
  
+struct lov_user_md_v3 {           /* LOV EA user data (host-endian) */
+        __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V3 */
+        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+        __u64 lmm_object_id;      /* LOV object ID */
+        __u64 lmm_object_gr;      /* LOV object group */
+        __u32 lmm_stripe_size;    /* size of stripe in bytes */
+        __u16 lmm_stripe_count;   /* num stripes in use for this object */
+        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
+        char  lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
  /* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
   * use this.  It is unsafe to #define those values in this header as it
   * is possible the application has already #included <sys/stat.h>. */
@@ -120,7 +176,12 @@ struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
  #define lov_user_mds_data lov_user_mds_data_v1
  struct lov_user_mds_data_v1 {
          lstat_t lmd_st;                 /* MDS stat struct */
-        struct lov_user_md_v1 lmd_lmm;  /* LOV EA user data */
+        struct lov_user_md_v1 lmd_lmm;  /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+        lstat_t lmd_st;                 /* MDS stat struct */
+        struct lov_user_md_v3 lmd_lmm;  /* LOV EA V3 user data */
  } __attribute__((packed));
  #endif
  
@@ -233,6 +294,8 @@ struct mds_grp_downcall_data {
  
  #endif /* !__KERNEL__ */
  
+#define QFMT_LDISKFS 2 /* pre-1.6.6 compatibility */
+
  typedef enum lustre_quota_version {
          LUSTRE_QUOTA_V1 = 0,
          LUSTRE_QUOTA_V2 = 1
diff --git a/lustre/include/lustre/types.h b/lustre/include/lustre/types.h

index a5da592..46afce0 100644 (file)
--- a/lustre/include/lustre/types.h
+++ b/lustre/include/lustre/types.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _LUSTRE_TYPES_H
  #define _LUSTRE_TYPES_H
  
diff --git a/lustre/include/lustre_cache.h b/lustre/include/lustre_cache.h

index d5a5337..5bff0a2 100644 (file)
--- a/lustre/include/lustre_cache.h
+++ b/lustre/include/lustre_cache.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef LUSTRE_CACHE_H
diff --git a/lustre/include/lustre_cfg.h b/lustre/include/lustre_cfg.h

index 3b770d3..cb1e9e4 100644 (file)
--- a/lustre/include/lustre_cfg.h
+++ b/lustre/include/lustre_cfg.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LUSTRE_CFG_H
@@ -56,6 +70,10 @@ enum lcfg_command_type {
          LCFG_LOG_START      = 0x00ce011,
          LCFG_LOG_END        = 0x00ce012,
          LCFG_LOV_ADD_INA    = 0x00ce013,
+        LCFG_POOL_NEW       = 0x00ce020,
+        LCFG_POOL_ADD       = 0x00ce021,
+        LCFG_POOL_REM       = 0x00ce022,
+        LCFG_POOL_DEL       = 0x00ce023,
  };
  
  struct lustre_cfg_bufs {
@@ -200,7 +218,7 @@ static inline struct lustre_cfg *lustre_cfg_new(int cmd,
          OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
                                         bufs->lcfg_buflen));
          if (!lcfg)
-                RETURN(lcfg);
+                RETURN(ERR_PTR(-ENOMEM));
  
          lcfg->lcfg_version = LUSTRE_CFG_VERSION;
          lcfg->lcfg_command = cmd;
diff --git a/lustre/include/lustre_commit_confd.h b/lustre/include/lustre_commit_confd.h

deleted file mode 100644 (file)

index 87e8c91..0000000
--- a/lustre/include/lustre_commit_confd.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *   This file is part of Lustre, http://www.lustre.org
- *
- * Structures relating to the log commit thread.
- */
-
-#ifndef _LUSTRE_COMMIT_CONFD_H
-#define _LUSTRE_COMMIT_CONFD_H
-
-#include <lustre_log.h>
-
-struct llog_canceld_ctxt {
-        struct list_head           llcd_list;  /* free or pending struct list */
-        struct llog_ctxt          *llcd_ctxt;
-        struct llog_commit_master *llcd_lcm;
-        int                        llcd_size;
-        int                        llcd_cookiebytes;
-        struct llog_cookie         llcd_cookies[0];
-};
-
-struct llog_commit_master {
-        struct list_head        lcm_thread_busy;  /* list of busy daemons */
-        struct list_head        lcm_thread_idle;  /* list of idle daemons */
-        spinlock_t              lcm_thread_lock;  /* protects thread_list */
-        atomic_t                lcm_thread_numidle;/* number of idle threads */
-        atomic_t                lcm_thread_total; /* total number of threads */
-        int                     lcm_thread_max;   /* <= num_osts normally */
-
-        int                     lcm_flags;
-        cfs_waitq_t             lcm_waitq;
-
-        struct list_head        lcm_llcd_pending; /* llog_canceld_ctxt to send */
-        struct list_head        lcm_llcd_resend;  /* try to resend this data */
-        struct list_head        lcm_llcd_free;    /* free llog_canceld_ctxt */
-        spinlock_t              lcm_llcd_lock;    /* protects llcd_free */
-        atomic_t                lcm_llcd_numfree; /* items on llcd_free */
-        int                     lcm_llcd_minfree; /* min free on llcd_free */
-        int                     lcm_llcd_maxfree; /* max free on llcd_free */
-};
-
-#define LLOG_LCM_FL_EXIT        0x01
-#define LLOG_LCM_FL_EXIT_FORCE  0x02
-
-/* the thread data that collects local commits and makes rpc's */
-struct llog_commit_daemon {
-        struct list_head           lcd_lcm_list;  /* list of daemon threads */
-        struct list_head           lcd_llcd_list; /* list of pending RPCs */
-        struct llog_commit_master *lcd_lcm;       /* pointer back to parent */
-        int                        lcd_index;     /* the index of the llog daemon */
-};
-
-/* ptlrpc/recov_thread.c */
-int llog_start_commit_thread(struct llog_commit_master *);
-
-int llog_init_commit_master(struct llog_commit_master *);
-int llog_cleanup_commit_master(struct llog_commit_master *lcm, int force);
-#endif /* _LUSTRE_COMMIT_CONFD_H */
diff --git a/lustre/include/lustre_debug.h b/lustre/include/lustre_debug.h

index c6bd7ba..5ded0ff 100644 (file)
--- a/lustre/include/lustre_debug.h
+++ b/lustre/include/lustre_debug.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LUSTRE_DEBUG_H
diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h

index 1ab9b2e..1e55a80 100644 (file)
--- a/lustre/include/lustre_disk.h
+++ b/lustre/include/lustre_disk.h
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_disk.h
   *
   * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  #ifndef _LUSTRE_DISK_H
@@ -149,7 +166,7 @@ struct lustre_mount_data {
  #define LMD_FLG_SERVER       0x0001  /* Mounting a server */
  #define LMD_FLG_CLIENT       0x0002  /* Mounting a client */
  #define LMD_FLG_ABORT_RECOV  0x0008  /* Abort recovery */
-#define LMD_FLG_NOSVC        0x0010  /* Only start MGS/MGC for servers, 
+#define LMD_FLG_NOSVC        0x0010  /* Only start MGS/MGC for servers,
                                          no other services */
  #define LMD_FLG_NOMGS        0x0020  /* Only start target for servers, reusing
                                          existing MGS services */
@@ -165,10 +182,17 @@ struct lustre_mount_data {
  #if LR_CLIENT_START < LR_SERVER_SIZE
  #error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
  #endif
-/* This limit is arbitrary (32k clients on x86), but it is convenient to use
- * 2^n * CFS_PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */
-#define LR_MAX_CLIENTS (CFS_PAGE_SIZE * 8)
  
+/*
+ * This limit is arbitrary (131072 clients on x86), but it is convenient to use
+ * 2^n * CFS_PAGE_SIZE * 8 for the number of bits that fit an order-n allocation.
+ * If we need more than 131072 clients (order-2 allocation on x86) then this
+ * should become an array of single-page pointers that are allocated on demand.
+ */
+#define LR_MAX_CLIENTS max(128 * 1024UL, CFS_PAGE_SIZE * 8)
+/* version recovery */
+#define LR_EPOCH_BITS   32
+#define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
  
  /* COMPAT_146 */
  #define OBD_COMPAT_OST          0x00000002 /* this is an OST (temporary) */
@@ -182,8 +206,10 @@ struct lustre_mount_data {
  #define OBD_INCOMPAT_OST        0x00000002 /* this is an OST */
  #define OBD_INCOMPAT_MDT        0x00000004 /* this is an MDT */
  #define OBD_INCOMPAT_COMMON_LR  0x00000008 /* common last_rvcd format */
+#define OBD_INCOMPAT_FID        0x00000010 /* FID is enabled */
+#define OBD_INCOMPAT_SOM        0x00000020 /* Size-On-MDS is enabled */
  
-
+#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
  /* Data stored per server at the head of the last_rcvd file.  In le32 order.
     This should be common to filter_internal.h, lustre_mds.h */
  struct lr_server_data {
@@ -203,7 +229,13 @@ struct lr_server_data {
          __u8  lsd_peeruuid[40];    /* UUID of MDS associated with this OST */
          __u32 lsd_ost_index;       /* index number of OST in LOV */
          __u32 lsd_mdt_index;       /* index number of MDT in LMV */
-        __u8  lsd_padding[LR_SERVER_SIZE - 148];
+        __u32 lsd_start_epoch;     /* VBR: start epoch from last boot */
+        /** transaction values since lsd_trans_table_time */
+        __u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
+        /** start point of transno table below */
+        __u32 lsd_trans_table_time; /* time of first slot in table above */
+        __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
+        __u8  lsd_padding[LR_SERVER_SIZE - 288];
  };
  
  /* Data stored per client in the last_rcvd file.  In le32 order. */
@@ -218,9 +250,21 @@ struct lsd_client_data {
          __u64 lcd_last_close_xid;     /* xid for the last transaction */
          __u32 lcd_last_close_result;  /* result from last RPC */
          __u32 lcd_last_close_data;    /* per-op data */
-        __u8  lcd_padding[LR_CLIENT_SIZE - 88];
+        /* VBR: last versions */
+        __u64 lcd_pre_versions[4];
+        __u32 lcd_last_epoch;
+        /** orphans handling for delayed export rely on that */
+        __u32 lcd_first_epoch;
+        __u8  lcd_padding[LR_CLIENT_SIZE - 128];
  };
  
+static inline __u64 lsd_last_transno(struct lsd_client_data *lcd)
+{
+        return le64_to_cpu(lcd->lcd_last_transno) >
+               le64_to_cpu(lcd->lcd_last_close_transno) ?
+               le64_to_cpu(lcd->lcd_last_transno) :
+               le64_to_cpu(lcd->lcd_last_close_transno);
+}
  
  #ifdef __KERNEL__
  /****************** superblock additional info *********************/
@@ -240,14 +284,8 @@ struct lustre_sb_info {
  #define LSI_UMOUNT_FORCE                 0x00000010
  #define LSI_UMOUNT_FAILOVER              0x00000020
  
-#if  (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-# define    s2lsi(sb)        ((struct lustre_sb_info *)((sb)->s_fs_info))
-# define    s2lsi_nocast(sb) ((sb)->s_fs_info)
-#else  /* 2.4 here */
-# define    s2lsi(sb)        ((struct lustre_sb_info *)((sb)->u.generic_sbp))
-# define    s2lsi_nocast(sb) ((sb)->u.generic_sbp)
-#endif
-
+#define    s2lsi(sb)        ((struct lustre_sb_info *)((sb)->s_fs_info))
+#define    s2lsi_nocast(sb) ((sb)->s_fs_info)
  #define     get_profile_name(sb)   (s2lsi(sb)->lsi_lmd->lmd_profile)
  
  #endif /* __KERNEL__ */
diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h

index 56c769b..8e9c18a 100644 (file)
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -1,6 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * (visit-tags-table FILE)
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LUSTRE_DLM_H__
@@ -28,7 +59,12 @@ struct obd_device;
  
  #define OBD_LDLM_DEVICENAME  "ldlm"
  
+#ifdef HAVE_BGL_SUPPORT
+/* 1.5 times the maximum 128 tasks available in VN mode */
+#define LDLM_DEFAULT_LRU_SIZE 196
+#else
  #define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#endif
  #define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000))
  
  typedef enum {
@@ -136,7 +172,7 @@ typedef enum {
   * the 1st operation, whereas the 2nd operation has canceled this lock and
   * is waiting for rpc_lock which is taken by the 1st operation.
   * LDLM_FL_BL_AST is to be set by ldlm_callback_handler() to the lock not allow
- * ELC code to cancel it. 
+ * ELC code to cancel it.
   * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock cache is
   * droped to let ldlm_callback_handler() return EINVAL to the server. It is
   * used when ELC rpc is already prepared and is waiting for rpc_lock, too late
@@ -147,6 +183,9 @@ typedef enum {
  /* measure lock contention and return -EUSERS if locking contention is high */
  #define LDLM_FL_DENY_ON_CONTENTION 0x40000000
  
+/* 0x80000000 is occupied by LDLM_AST_DISCARD_DATA */
+#define LDLM_FL_FAIL_LOC       0x100000000ULL
+
  /* The blocking callback is overloaded to perform two functions.  These flags
   * indicate which operation should be performed. */
  #define LDLM_CB_BLOCKING    1
@@ -214,52 +253,98 @@ struct ldlm_namespace;
  
  struct ldlm_pool_ops {
          int (*po_recalc)(struct ldlm_pool *pl);
-        int (*po_shrink)(struct ldlm_pool *pl, int nr, 
+        int (*po_shrink)(struct ldlm_pool *pl, int nr,
                           unsigned int gfp_mask);
          int (*po_setup)(struct ldlm_pool *pl, int limit);
  };
  
-/* One second for pools thread check interval. */
+/** 
+ * One second for pools thread check interval. Each pool has own period. 
+ */
  #define LDLM_POOLS_THREAD_PERIOD (1)
  
-/* 5% margin for modest pools. See ldlm_pool.c for details. */
+/** 
+ * 5% margin for modest pools. See ldlm_pool.c for details. 
+ */
  #define LDLM_POOLS_MODEST_MARGIN (5)
  
-/* A change to SLV in % after which we want to wake up pools thread asap. */
-#define LDLM_POOLS_FAST_SLV_CHANGE (50)
+/**
+ * Default recalc period for server side pools in sec.
+ */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/**
+ * Default recalc period for client side pools in sec.
+ */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
  
  struct ldlm_pool {
-        /* Common pool fields */
-        cfs_proc_dir_entry_t  *pl_proc_dir;      /* Pool proc directory. */
-        char                   pl_name[100];     /* Pool name, should be long 
-                                                  * enough to contain complex
-                                                  * proc entry name. */
-        spinlock_t             pl_lock;          /* Lock for protecting slv/clv 
-                                                  * updates. */
-        atomic_t               pl_limit;         /* Number of allowed locks in
-                                                  * in pool, both, client and 
-                                                  * server side. */
-        atomic_t               pl_granted;       /* Number of granted locks. */
-        atomic_t               pl_grant_rate;    /* Grant rate per T. */
-        atomic_t               pl_cancel_rate;   /* Cancel rate per T. */
-        atomic_t               pl_grant_speed;   /* Grant speed (GR-CR) per T. */
-        __u64                  pl_server_lock_volume; /* Server lock volume. 
-                                                  * Protected by pl_lock */
-        __u64                  pl_client_lock_volume; /* Client lock volue. */
-        atomic_t               pl_lock_volume_factor; /* Lock volume factor. */
-
-        time_t                 pl_recalc_time;   /* Time when last slv from 
-                                                  * server was obtained. */
-        struct ldlm_pool_ops  *pl_ops;           /* Recalc and shrink ops. */ 
-
-        int                    pl_grant_plan;    /* Planned number of granted 
-                                                  * locks for next T. */
-        int                    pl_grant_step;    /* Grant plan step for next 
-                                                  * T. */
-
-        struct lprocfs_stats  *pl_stats;         /* Pool statistics. */
+        /**
+         * Pool proc directory.
+         */
+        cfs_proc_dir_entry_t  *pl_proc_dir;
+        /**
+         * Pool name, should be long enough to contain compound proc entry name.
+         */
+        char                   pl_name[100];
+        /**
+         * Lock for protecting slv/clv updates.
+         */
+        spinlock_t             pl_lock;
+        /**
+         * Number of allowed locks in in pool, both, client and server side.
+         */
+        atomic_t               pl_limit;
+        /**
+         * Number of granted locks in
+         */
+        atomic_t               pl_granted;
+        /**
+         * Grant rate per T.
+         */
+        atomic_t               pl_grant_rate;
+        /**
+         * Cancel rate per T.
+         */
+        atomic_t               pl_cancel_rate;
+        /**
+         * Grant speed (GR-CR) per T.
+         */
+        atomic_t               pl_grant_speed;
+        /**
+         * Server lock volume. Protected by pl_lock.
+         */
+        __u64                  pl_server_lock_volume;
+        /**
+         * Current biggest client lock volume. Protected by pl_lock.
+         */
+        __u64                  pl_client_lock_volume;
+        /**
+         * Lock volume factor. SLV on client is calculated as following:
+         * server_slv * lock_volume_factor.
+         */
+        atomic_t               pl_lock_volume_factor;
+        /**
+         * Time when last slv from server was obtained.
+         */
+        time_t                 pl_recalc_time;
+        /**
+          * Recalc period for pool.
+          */
+        time_t                 pl_recalc_period;
+        /**
+         * Recalc and shrink ops.
+         */
+        struct ldlm_pool_ops  *pl_ops;
+        /**
+         * Number of planned locks for next period.
+         */
+        int                    pl_grant_plan;
+        /**
+         * Pool statistics.
+         */
+        struct lprocfs_stats  *pl_stats;
  };
-
  typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
                                 void *req_cookie, ldlm_mode_t mode, int flags,
                                 void *data);
@@ -281,7 +366,7 @@ typedef enum {
  #define NS_DEFAULT_CONTENTION_SECONDS 2
  #define NS_DEFAULT_CONTENDED_LOCKS 32
  
-/* Default value for ->ns_shrink_thumb. If lock is not extent one its cost 
+/* Default value for ->ns_shrink_thumb. If lock is not extent one its cost
   * is one page. Here we have 256 pages which is 1M on i386. Thus by default
   * all extent locks which have more than 1M long extent will be kept in lru,
   * others (including ibits locks) will be canceled on memory pressure event. */
@@ -307,7 +392,8 @@ struct ldlm_namespace {
  
          unsigned int           ns_max_unused;
          unsigned int           ns_max_age;
-        
+        unsigned int           ns_timeouts;
+
          /* Lower limit to number of pages in lock to keep it in cache */
          unsigned int           ns_shrink_thumb;
          cfs_time_t             ns_next_dump;   /* next debug dump, jiffies */
@@ -338,7 +424,7 @@ struct ldlm_namespace {
  static inline int ns_is_client(struct ldlm_namespace *ns)
  {
          LASSERT(ns != NULL);
-        LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | 
+        LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
                                      LDLM_NAMESPACE_SERVER)));
          LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
                  ns->ns_client == LDLM_NAMESPACE_SERVER);
@@ -348,7 +434,7 @@ static inline int ns_is_client(struct ldlm_namespace *ns)
  static inline int ns_is_server(struct ldlm_namespace *ns)
  {
          LASSERT(ns != NULL);
-        LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | 
+        LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
                                      LDLM_NAMESPACE_SERVER)));
          LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
                  ns->ns_client == LDLM_NAMESPACE_SERVER);
@@ -383,7 +469,7 @@ typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
  /* Interval node data for each LDLM_EXTENT lock */
  struct ldlm_interval {
          struct interval_node li_node;   /* node for tree mgmt */
-        struct list_head     li_group;  /* the locks which have the same 
+        struct list_head     li_group;  /* the locks which have the same
                                           * policy - group of the policy */
  };
  #define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
@@ -401,7 +487,7 @@ struct ldlm_lock {
          struct portals_handle l_handle; // must be first in the structure
          atomic_t              l_refc;
  
-        /* internal spinlock protects l_resource.  we should hold this lock 
+        /* internal spinlock protects l_resource.  we should hold this lock
           * first before grabbing res_lock.*/
          spinlock_t            l_lock;
  
@@ -416,8 +502,8 @@ struct ldlm_lock {
  
          struct ldlm_interval *l_tree_node;      /* tree node for ldlm_extent */
  
-        /* protected by led_lock */
-        struct list_head      l_export_chain; // per-export chain of locks
+        /* protected by per-bucket exp->exp_lock_hash locks */
+        struct hlist_node     l_exp_hash;       /* per export hash of locks */
  
          /* protected by lr_lock */
          ldlm_mode_t           l_req_mode;
@@ -434,7 +520,7 @@ struct ldlm_lock {
          ldlm_policy_data_t    l_policy_data;
  
          /* protected by lr_lock */
-        __u32                 l_flags;
+        __u64                 l_flags;
          __u32                 l_readers;
          __u32                 l_writers;
          __u8                  l_destroyed;
@@ -469,7 +555,7 @@ struct ldlm_lock {
          /* for ldlm_add_ast_work_item() */
          struct list_head      l_bl_ast;
          struct list_head      l_cp_ast;
-        struct ldlm_lock     *l_blocking_lock; 
+        struct ldlm_lock     *l_blocking_lock;
          int                   l_bl_ast_run;
  
          /* protected by lr_lock, linkages to "skip lists" */
@@ -613,9 +699,11 @@ int ldlm_handle_cancel(struct ptlrpc_request *req);
  int ldlm_request_cancel(struct ptlrpc_request *req,
                          struct ldlm_request *dlm_req, int first);
  int ldlm_del_waiting_lock(struct ldlm_lock *lock);
-int ldlm_refresh_waiting_lock(struct ldlm_lock *lock);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
  int ldlm_get_ref(void);
  void ldlm_put_ref(void);
+int ldlm_init_export(struct obd_export *exp);
+void ldlm_destroy_export(struct obd_export *exp);
  
  /* ldlm_lock.c */
  ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res);
@@ -746,10 +834,10 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **req,
                       void *lvb, __u32 lvb_len, void *lvb_swabber,
                       struct lustre_handle *lockh, int async);
  struct ptlrpc_request *ldlm_prep_enqueue_req(struct obd_export *exp,
-                                             int bufcount, int *size,
+                                             int bufcount, __u32 *size,
                                               struct list_head *head, int count);
  struct ptlrpc_request *ldlm_prep_elc_req(struct obd_export *exp, int version,
-                                         int opc, int bufcount, int *size,
+                                         int opc, int bufcount, __u32 *size,
                                           int bufoff, int canceloff,
                                           struct list_head *cancels, int count);
  int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
@@ -820,11 +908,10 @@ void unlock_res_and_lock(struct ldlm_lock *lock);
  void ldlm_pools_recalc(ldlm_side_t client);
  int ldlm_pools_init(void);
  void ldlm_pools_fini(void);
-void ldlm_pools_wakeup(void);
  
-int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, 
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
                     int idx, ldlm_side_t client);
-int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, 
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
                       unsigned int gfp_mask);
  void ldlm_pool_fini(struct ldlm_pool *pl);
  int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h

index 43fc0bb..f0ab6ad 100644 (file)
--- a/lustre/include/lustre_export.h
+++ b/lustre/include/lustre_export.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __EXPORT_H
@@ -8,17 +40,27 @@
  #include <lustre/lustre_idl.h>
  #include <lustre_dlm.h>
  #include <lprocfs_status.h>
+#include <class_hash.h>
+
+struct lu_export_data {
+        struct semaphore        led_lcd_lock; /**< protect led_lcd */
+        struct lsd_client_data *led_lcd;      /**< client data */
+        loff_t                  led_lr_off;   /**< offset in last_rcvd */
+        int                     led_lr_idx;   /**< client index */
+};
  
-/* Data stored per client in the last_rcvd file.  In le32 order. */
  struct mds_export_data {
+        struct lu_export_data   med_led;
          struct list_head        med_open_head;
          spinlock_t              med_open_lock; /* lock med_open_head, mfd_list*/
-        struct lsd_client_data *med_lcd;
          __u64                   med_ibits_known;
-        loff_t                  med_lr_off;
-        int                     med_lr_idx;
  };
  
+#define med_lcd_lock    med_led.led_lcd_lock
+#define med_lcd         med_led.led_lcd
+#define med_lr_off      med_led.led_lr_off
+#define med_lr_idx      med_led.led_lr_idx
+
  struct osc_creator {
          spinlock_t              oscc_lock;
          struct list_head        oscc_list;
@@ -26,26 +68,20 @@ struct osc_creator {
          obd_id                  oscc_last_id;//last available pre-created object
          obd_id                  oscc_next_id;// what object id to give out next
          int                     oscc_grow_count;
+        int                     oscc_max_grow_count;
          struct obdo             oscc_oa;
          int                     oscc_flags;
          cfs_waitq_t             oscc_waitq; /* creating procs wait on this */
  };
  
-struct ldlm_export_data {
-        struct list_head       led_held_locks; /* protected by led_lock below */
-        spinlock_t             led_lock;
-};
-
  struct ec_export_data { /* echo client */
          struct list_head eced_locks;
  };
  
  /* In-memory access to client data from OST struct */
  struct filter_export_data {
-        spinlock_t                 fed_lock;      /* protects fed_open_head */
-        struct lsd_client_data    *fed_lcd;
-        loff_t                     fed_lr_off;
-        int                        fed_lr_idx;
+        struct lu_export_data      fed_led;
+        spinlock_t                 fed_lock;      /**< protects fed_mod_list */
          long                       fed_dirty;    /* in bytes */
          long                       fed_grant;    /* in bytes */
          struct list_head           fed_mod_list; /* files being modified */
@@ -54,22 +90,41 @@ struct filter_export_data {
          struct brw_stats           fed_brw_stats;
  };
  
+#define fed_lcd_lock    fed_led.led_lcd_lock
+#define fed_lcd         fed_led.led_lcd
+#define fed_lr_off      fed_led.led_lr_off
+#define fed_lr_idx      fed_led.led_lr_idx
+
+typedef struct nid_stat_uuid {
+        struct list_head ns_uuid_list;
+        struct obd_uuid  ns_uuid;
+} nid_stat_uuid_t;
+
  typedef struct nid_stat {
          lnet_nid_t               nid;
          struct hlist_node        nid_hash;
          struct list_head         nid_list;
+        struct list_head         nid_uuid_list;
          struct obd_device       *nid_obd;
          struct proc_dir_entry   *nid_proc;
          struct lprocfs_stats    *nid_stats;
          struct brw_stats        *nid_brw_stats;
+        struct lprocfs_stats    *nid_ldlm_stats;
          int                      nid_exp_ref_count;
-}nid_stat_t;
+} nid_stat_t;
+
+enum obd_option {
+        OBD_OPT_FORCE =         0x0001,
+        OBD_OPT_FAILOVER =      0x0002,
+        OBD_OPT_ABORT_RECOV =   0x0004,
+};
  
  struct obd_export {
          struct portals_handle     exp_handle;
          atomic_t                  exp_refcount;
          atomic_t                  exp_rpc_count;
          struct obd_uuid           exp_client_uuid;
+        lnet_nid_t                exp_client_nid;
          struct list_head          exp_obd_chain;
          struct hlist_node         exp_uuid_hash; /* uuid-export hash*/
          struct hlist_node         exp_nid_hash; /* nid-export hash */
@@ -78,36 +133,54 @@ struct obd_export {
          struct obd_device        *exp_obd;
          struct obd_import        *exp_imp_reverse; /* to make RPCs backwards */
          struct nid_stat          *exp_nid_stats;
-        struct lprocfs_stats     *exp_ldlm_stats;
          struct lprocfs_stats     *exp_ops_stats;
          struct ptlrpc_connection *exp_connection;
          __u32                     exp_conn_cnt;
-        struct ldlm_export_data   exp_ldlm_data;
+        lustre_hash_t            *exp_lock_hash; /* existing lock hash */
+        spinlock_t                exp_lock_hash_lock;
          struct list_head          exp_outstanding_replies;
+        struct list_head          exp_uncommitted_replies;
+        spinlock_t                exp_uncommitted_replies_lock;
          time_t                    exp_last_request_time;
          struct list_head          exp_req_replay_queue;
          spinlock_t                exp_lock; /* protects flags int below */
          /* ^ protects exp_outstanding_replies too */
          __u64                     exp_connect_flags;
-        int                       exp_flags;
+        enum obd_option           exp_flags;
          unsigned long             exp_failed:1,
+                                  exp_in_recovery:1,
                                    exp_disconnected:1,
                                    exp_connecting:1,
+                                  /* VBR: export missed recovery */
+                                  exp_delayed:1,
+                                  /* VBR: failed version checking */
+                                  exp_vbr_failed:1,
                                    exp_replay_needed:1,
                                    exp_need_sync:1, /* needs sync from connect */
                                    exp_libclient:1; /* liblustre client? */
+        struct list_head          exp_queued_rpc;  /* RPC to be handled */
+        /* VBR: per-export last committed */
+        __u64                     exp_last_committed;
          union {
+                struct lu_export_data     eu_target_data;
                  struct mds_export_data    eu_mds_data;
                  struct filter_export_data eu_filter_data;
                  struct ec_export_data     eu_ec_data;
          } u;
  };
  
+#define exp_target_data u.eu_target_data
  #define exp_mds_data    u.eu_mds_data
-#define exp_lov_data    u.eu_lov_data
  #define exp_filter_data u.eu_filter_data
  #define exp_ec_data     u.eu_ec_data
  
+static inline int exp_expired(struct obd_export *exp, __u32 age)
+{
+        LASSERT(exp->exp_delayed);
+        return cfs_time_before(exp->exp_last_request_time + age,
+                               cfs_time_current_sec());
+}
+
  static inline int exp_connect_cancelset(struct obd_export *exp)
  {
          LASSERT(exp != NULL);
@@ -120,6 +193,13 @@ static inline int exp_connect_lru_resize(struct obd_export *exp)
          return !!(exp->exp_connect_flags & OBD_CONNECT_LRU_RESIZE);
  }
  
+static inline int exp_connect_vbr(struct obd_export *exp)
+{
+        LASSERT(exp != NULL);
+        LASSERT(exp->exp_connection);
+        return !!(exp->exp_connect_flags & OBD_CONNECT_VBR);
+}
+
  static inline int imp_connect_lru_resize(struct obd_import *imp)
  {
          struct obd_connect_data *ocd;
diff --git a/lustre/include/lustre_fsfilt.h b/lustre/include/lustre_fsfilt.h

index 41b9431..2ff66f9 100644 (file)
--- a/lustre/include/lustre_fsfilt.h
+++ b/lustre/include/lustre_fsfilt.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2004 Cluster File Systems, Inc. <info@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Filesystem interface helper.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fsfilt.h
   *
+ * Filesystem interface helper.
   */
  
  #ifndef _LUSTRE_FSFILT_H
diff --git a/lustre/include/lustre_ha.h b/lustre/include/lustre_ha.h

index 43071ba..448ee12 100644 (file)
--- a/lustre/include/lustre_ha.h
+++ b/lustre/include/lustre_ha.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LUSTRE_HA_H
diff --git a/lustre/include/lustre_handles.h b/lustre/include/lustre_handles.h

index 9186173..e8d24e8 100644 (file)
--- a/lustre/include/lustre_handles.h
+++ b/lustre/include/lustre_handles.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __LUSTRE_HANDLES_H_
  #define __LUSTRE_HANDLES_H_
  
diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h

index fb66323..8852741 100644 (file)
--- a/lustre/include/lustre_import.h
+++ b/lustre/include/lustre_import.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __IMPORT_H
@@ -8,7 +40,6 @@
  #include <lustre_handles.h>
  #include <lustre/lustre_idl.h>
  
-
  /* Adaptive Timeout stuff */
  #define D_ADAPTTO D_OTHER
  #define AT_BINS 4                  /* "bin" means "N seconds of history" */
@@ -64,7 +95,7 @@ struct obd_import_conn {
          __u64                     oic_last_attempt; /* jiffies, 64-bit */
  };
  
-#define IMP_AT_MAX_PORTALS 4
+#define IMP_AT_MAX_PORTALS 8
  struct imp_at {
          int                     iat_portal[IMP_AT_MAX_PORTALS];
          struct adaptive_timeout iat_net_latency;
@@ -91,6 +122,7 @@ struct obd_import {
          cfs_waitq_t               imp_recovery_waitq;
  
          atomic_t                  imp_inflight;
+        atomic_t                  imp_unregistering;
          atomic_t                  imp_replay_inflight;
          atomic_t                  imp_inval_count;
          enum lustre_imp_state     imp_state;
@@ -117,8 +149,11 @@ struct obd_import {
                                    imp_replayable:1,       /* try to recover the import */
                                    imp_dlm_fake:1,         /* don't run recovery (timeout instead) */
                                    imp_server_timeout:1,   /* use 1/2 timeout on MDS' OSCs */
-                                  imp_initial_recov:1,    /* retry the initial connection */  
+                                  imp_initial_recov:1,    /* retry the initial connection */
                                    imp_initial_recov_bk:1, /* turn off init_recov after trying all failover nids */
+                                  imp_delayed_recovery:1, /* VBR: imp in delayed recovery */
+                                  imp_no_lock_replay:1,   /* VBR: if gap was found then no lock replays */
+                                  imp_vbr_failed:1,       /* recovery by versions was failed */
                                    imp_force_verify:1,     /* force an immidiate ping */
                                    imp_pingable:1,         /* pingable */
                                    imp_resend_replay:1,    /* resend for replay */
@@ -138,6 +173,18 @@ struct obd_import {
  };
  
  /* import.c */
+static inline unsigned int at_est2timeout(unsigned int val)
+{
+        /* add an arbitrary minimum: 125% +5 sec */
+        return (val + (val >> 2) + 5);
+}
+
+static inline unsigned int at_timeout2est(unsigned int val)
+{
+        /* restore estimate value from timeout: e=4/5(t-5) */
+        return (max((val << 2) / 5, 5U) - 4);
+}
+
  static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
          memset(at, 0, sizeof(*at));
          at->at_current = val;
diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h

index 67c20a0..7c1a622 100644 (file)
--- a/lustre/include/lustre_lib.h
+++ b/lustre/include/lustre_lib.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Basic Lustre library routines.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_lib.h
+ *
+ * Basic Lustre library routines.
   */
  
  #ifndef _LUSTRE_LIB_H
@@ -55,6 +71,18 @@ struct obd_export;
  
  void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
                            int error);
+void target_trans_table_init(struct obd_device *obd);
+__u32 target_trans_table_last_time(struct obd_export *exp);
+void target_trans_table_recalc(struct obd_device *obd, __u32 new_age);
+void target_trans_table_update(struct obd_export *exp, __u64 transno);
+#ifdef __KERNEL__
+int target_fs_version_capable(struct obd_device *obd);
+#else
+static inline int target_fs_version_capable(struct obd_device *obd)
+{
+        return 0;
+}
+#endif
  int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler);
  int target_handle_disconnect(struct ptlrpc_request *req);
  void target_destroy_export(struct obd_export *exp);
@@ -64,21 +92,21 @@ int target_handle_ping(struct ptlrpc_request *req);
  int target_pack_pool_reply(struct ptlrpc_request *req);
  void target_committed_to_req(struct ptlrpc_request *req);
  
-#ifdef HAVE_QUOTA_SUPPORT
  /* quotacheck callback, dqacq/dqrel callback handler */
  int target_handle_qc_callback(struct ptlrpc_request *req);
+#ifdef HAVE_QUOTA_SUPPORT
  int target_handle_dqacq_callback(struct ptlrpc_request *req);
  #else
  #define target_handle_dqacq_callback(req) ldlm_callback_reply(req, -ENOTSUPP)
-#define target_handle_qc_callback(req) (0)
  #endif
  
  void target_cancel_recovery_timer(struct obd_device *obd);
  void target_abort_recovery(void *data);
+int target_recovery_check_and_stop(struct obd_device *obd);
  void target_cleanup_recovery(struct obd_device *obd);
  int target_queue_recovery_request(struct ptlrpc_request *req,
                                    struct obd_device *obd);
-int target_queue_last_replay_reply(struct ptlrpc_request *req, int rc);
+int target_handle_reply(struct ptlrpc_request *req, int rc, int fail);
  void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
  
  /* client.c */
@@ -446,6 +474,7 @@ static inline void obd_ioctl_freedata(char *buf, int len)
  
  #define OBD_IOC_LOV_GET_CONFIG         _IOWR('f', 132, OBD_IOC_DATA_TYPE)
  #define OBD_IOC_CLIENT_RECOVER         _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PING_TARGET            _IOW ('f', 136, OBD_IOC_DATA_TYPE)
  
  #define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 139      )
  #define OBD_IOC_NO_TRANSNO             _IOW ('f', 140, OBD_IOC_DATA_TYPE)
@@ -476,6 +505,7 @@ static inline void obd_ioctl_freedata(char *buf, int len)
  #define OBD_IOC_DUMP_LOG               _IOWR('f', 185, OBD_IOC_DATA_TYPE)
  #define OBD_IOC_CLEAR_LOG              _IOWR('f', 186, OBD_IOC_DATA_TYPE)
  #define OBD_IOC_PARAM                  _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL                   _IOWR('f', 188, OBD_IOC_DATA_TYPE)
  
  #define OBD_IOC_CATLOGLIST             _IOWR('f', 190, OBD_IOC_DATA_TYPE)
  #define OBD_IOC_LLOG_INFO              _IOWR('f', 191, OBD_IOC_DATA_TYPE)
@@ -759,4 +789,3 @@ do {                                                                    \
  #endif
  
  #endif /* _LUSTRE_LIB_H */
-
diff --git a/lustre/include/lustre_lite.h b/lustre/include/lustre_lite.h

index e4e8ee8..6543cba 100644 (file)
--- a/lustre/include/lustre_lite.h
+++ b/lustre/include/lustre_lite.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LL_H
@@ -98,6 +130,8 @@ struct lustre_client_ocd {
           */
          __u64      lco_flags;
          spinlock_t lco_lock;
+        struct obd_export *lco_mdc_exp;
+        struct obd_export *lco_osc_exp;
  };
  
  /*
@@ -126,7 +160,11 @@ static inline int ll_ocd_update(struct obd_device *host,
                         lco->lco_flags, flags);
                  spin_lock(&lco->lco_lock);
                  lco->lco_flags &= flags;
+                /* for each osc event update ea size */
+                if (lco->lco_osc_exp)
+                        mdc_init_ea_size(lco->lco_mdc_exp, lco->lco_osc_exp);
                  spin_unlock(&lco->lco_lock);
+
                  result = 0;
          } else {
                  CERROR("unexpected notification from %s %s!\n",
diff --git a/lustre/include/lustre_log.h b/lustre/include/lustre_log.h

index 565d08d..ad872a5 100644 (file)
--- a/lustre/include/lustre_log.h
+++ b/lustre/include/lustre_log.h
@@ -1,25 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <info@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Generic infrastructure for managing a collection of logs.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_log.h
   *
+ * Generic infrastructure for managing a collection of logs.
   * These logs are used for:
   *
   * - orphan recovery: OST adds record on create
@@ -96,16 +112,44 @@ extern int llog_cancel_rec(struct llog_handle *loghandle, int index);
  extern int llog_close(struct llog_handle *cathandle);
  extern int llog_get_size(struct llog_handle *loghandle);
  
-/* llog_cat.c   -  catalog api */
+/* llog_cat.c - catalog api */
  struct llog_process_data {
-        void *lpd_data;
-        llog_cb_t lpd_cb;
+        /**
+         * Any useful data needed while processing catalog. This is
+         * passed later to process callback.
+         */
+        void                *lpd_data;
+        /**
+         * Catalog process callback function, called for each record
+         * in catalog.
+         */
+        llog_cb_t            lpd_cb;
  };
  
  struct llog_process_cat_data {
-        int     first_idx;
-        int     last_idx;
-        /* to process catalog across zero record */
+        /**
+         * Temporary stored first_idx while scanning log.
+         */
+        int                  lpcd_first_idx;
+        /**
+         * Temporary stored last_idx while scanning log.
+         */
+        int                  lpcd_last_idx;
+};
+
+struct llog_process_cat_args {
+        /**
+         * Llog context used in recovery thread on OST (recov_thread.c)
+         */
+        struct llog_ctxt    *lpca_ctxt;
+        /**
+         * Llog callback used in recovery thread on OST (recov_thread.c)
+         */
+        void                *lpca_cb;
+        /**
+         * Data pointer for llog callback.
+         */
+        void                *lpca_arg;
  };
  
  int llog_cat_put(struct llog_handle *cathandle);
@@ -114,6 +158,7 @@ int llog_cat_add_rec(struct llog_handle *cathandle, struct llog_rec_hdr *rec,
  int llog_cat_cancel_records(struct llog_handle *cathandle, int count,
                              struct llog_cookie *cookies);
  int llog_cat_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data);
+int llog_cat_process_thread(void *data);
  int llog_cat_reverse_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data);
  int llog_cat_set_first_idx(struct llog_handle *cathandle, int index);
  
@@ -137,7 +182,7 @@ int llog_obd_origin_add(struct llog_ctxt *ctxt,
                          struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
                          struct llog_cookie *logcookies, int numcookies);
  
-int llog_cat_initialize(struct obd_device *obd, int count, 
+int llog_cat_initialize(struct obd_device *obd, int idx,
                          struct obd_uuid *uuid);
  int obd_llog_init(struct obd_device *obd, struct obd_device *disk_obd,
                    int count, struct llog_catid *logid, struct obd_uuid *uuid);
@@ -152,7 +197,7 @@ int llog_catalog_list(struct obd_device *obd, int count,
  /* llog_net.c */
  int llog_initiator_connect(struct llog_ctxt *ctxt);
  int llog_receptor_accept(struct llog_ctxt *ctxt, struct obd_import *imp);
-int llog_origin_connect(struct llog_ctxt *ctxt, int count,
+int llog_origin_connect(struct llog_ctxt *ctxt,
                          struct llog_logid *logid, struct llog_gen *gen,
                          struct obd_uuid *uuid);
  int llog_handle_connect(struct ptlrpc_request *req);
@@ -162,9 +207,9 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
                           struct lov_stripe_md *lsm, int count,
                           struct llog_cookie *cookies, int flags);
  int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp);
-int llog_repl_connect(struct llog_ctxt *ctxt, int count,
-                      struct llog_logid *logid, struct llog_gen *gen,
-                      struct obd_uuid *uuid);
+int llog_obd_repl_connect(struct llog_ctxt *ctxt,
+                          struct llog_logid *logid, struct llog_gen *gen,
+                          struct obd_uuid *uuid);
  
  struct llog_operations {
          int (*lop_write_rec)(struct llog_handle *loghandle,
@@ -191,7 +236,7 @@ struct llog_operations {
                         struct llog_cookie *logcookies, int numcookies);
          int (*lop_cancel)(struct llog_ctxt *ctxt, struct lov_stripe_md *lsm,
                            int count, struct llog_cookie *cookies, int flags);
-        int (*lop_connect)(struct llog_ctxt *ctxt, int count,
+        int (*lop_connect)(struct llog_ctxt *ctxt,
                             struct llog_logid *logid, struct llog_gen *gen,
                             struct obd_uuid *uuid);
          /* XXX add 2 more: commit callbacks and llog recovery functions */
@@ -200,7 +245,11 @@ struct llog_operations {
  /* llog_lvfs.c */
  extern struct llog_operations llog_lvfs_ops;
  int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray);
+                      char *name, int idx, int count,
+                      struct llog_catid *idarray);
+
+int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
+                      char *name, int idx, int count, struct llog_catid *idarray);
  
  struct llog_ctxt {
          int                      loc_idx; /* my index the obd array of ctxt's */
@@ -218,13 +267,76 @@ struct llog_ctxt {
          void                    *llog_proc_cb;
  };
  
-#ifndef __KERNEL__
-
-#define cap_raise(c, flag) do {} while(0)
+#define LCM_NAME_SIZE 64
+
+struct llog_commit_master {
+        /**
+         * Thread control flags (start, stop, etc.)
+         */
+        long                       lcm_flags;
+        /**
+         * Number of llcds onthis lcm.
+         */
+        atomic_t                   lcm_count;
+        /**
+         * Thread control structure. Used for control commit thread.
+         */
+        struct ptlrpcd_ctl         lcm_pc;
+        /**
+         * Lock protecting list of llcds.
+         */
+        spinlock_t                 lcm_lock;
+        /**
+         * Llcds in flight for debugging purposes.
+         */
+        struct list_head           lcm_llcds;
+        /**
+         * Commit thread name buffer. Only used for thread start.
+         */
+        char                       lcm_name[LCM_NAME_SIZE];
+};
  
-#define CAP_SYS_RESOURCE 24
+struct llog_canceld_ctxt {
+        /**
+         * Llog context this llcd is attached to. Used for accessing
+         * ->loc_import and others in process of canceling cookies
+         * gathered in this llcd.
+         */
+        struct llog_ctxt          *llcd_ctxt;
+        /**
+         * Cancel thread control stucture pointer. Used for accessing
+         * it to see if should stop processing and other needs.
+         */
+        struct llog_commit_master *llcd_lcm;
+        /**
+         * Maximal llcd size. Used in calculations on how much of room
+         * left in llcd to cookie comming cookies.
+         */
+        int                        llcd_size;
+        /**
+         * Link to lcm llcds list.
+         */
+        struct list_head           llcd_list;
+        /**
+         * Current llcd size while gathering cookies. This should not be
+         * more than ->llcd_size. Used for determining if we need to
+         * send this llcd (if full) and allocate new one. This is also
+         * used for copying new cookie at the end of buffer.
+         */
+        int                        llcd_cookiebytes;
+        /**
+         * Pointer to the start of cookies buffer.
+         */
+        struct llog_cookie         llcd_cookies[0];
+};
  
-#endif   /* !__KERNEL__ */
+/* ptlrpc/recov_thread.c */
+extern struct llog_commit_master *llog_recov_thread_init(char *name);
+extern void llog_recov_thread_fini(struct llog_commit_master *lcm, 
+                                   int force);
+extern int llog_recov_thread_start(struct llog_commit_master *lcm);
+extern void llog_recov_thread_stop(struct llog_commit_master *lcm, 
+                                   int force);
  
  static inline void llog_gen_init(struct llog_ctxt *ctxt)
  {
@@ -292,10 +404,10 @@ static inline int llog_data_len(int len)
  do {                                                                        \
           if ((ctxt) == NULL)                                                \
                   break;                                                     \
-         CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", (ctxt),      \
-                atomic_read(&(ctxt)->loc_refcount) - 1);                    \
           LASSERT(atomic_read(&(ctxt)->loc_refcount) > 0);                   \
           LASSERT(atomic_read(&(ctxt)->loc_refcount) < 0x5a5a5a);            \
+         CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", (ctxt),      \
+                atomic_read(&(ctxt)->loc_refcount) - 1);                    \
           __llog_ctxt_put(ctxt);                                             \
  } while (0)
  
@@ -303,14 +415,16 @@ static inline struct llog_ctxt *llog_get_context(struct obd_device *obd,
                                                     int index)
  {
           struct llog_ctxt *ctxt;
- 
-         if (index < 0 || index >= LLOG_MAX_CTXTS)
+
+         if (index < 0 || index >= LLOG_MAX_CTXTS) {
+                 CDEBUG(D_INFO, "obd %p bad index %d\n", obd, index);
                   return NULL;
-        
-         spin_lock(&obd->obd_dev_lock);  
+         }
+
+         spin_lock(&obd->obd_dev_lock);
           if (obd->obd_llog_ctxt[index] == NULL) {
                   spin_unlock(&obd->obd_dev_lock);
-                 CDEBUG(D_INFO, "obd %p and ctxt index %d is NULL \n", obd, index);
+                 CDEBUG(D_INFO,"obd %p and ctxt index %d is NULL \n",obd,index);
                   return NULL;
           }
           ctxt = llog_ctxt_get(obd->obd_llog_ctxt[index]);
@@ -329,8 +443,7 @@ static inline int llog_write_rec(struct llog_handle *handle,
                                   int numcookies, void *buf, int idx)
  {
          struct llog_operations *lop;
-        __u32 cap;
-        int rc, buflen;
+        int raised, rc, buflen;
          ENTRY;
  
          rc = llog_handle2ops(handle, &lop);
@@ -346,10 +459,12 @@ static inline int llog_write_rec(struct llog_handle *handle,
                  buflen = rec->lrh_len;
          LASSERT(size_round(buflen) == buflen);
  
-        cap = current->cap_effective;             
-        cap_raise(current->cap_effective, CAP_SYS_RESOURCE); 
+        raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+        if (!raised)
+                cfs_cap_raise(CFS_CAP_SYS_RESOURCE); 
          rc = lop->lop_write_rec(handle, rec, logcookies, numcookies, buf, idx);
-        current->cap_effective = cap; 
+        if (!raised)
+                cfs_cap_lower(CFS_CAP_SYS_RESOURCE); 
          RETURN(rc);
  }
  
@@ -445,8 +560,7 @@ static inline int llog_create(struct llog_ctxt *ctxt, struct llog_handle **res,
                                struct llog_logid *logid, char *name)
  {
          struct llog_operations *lop;
-        __u32 cap;
-        int rc;
+        int raised, rc;
          ENTRY;
  
          rc = llog_obd2ops(ctxt, &lop);
@@ -455,14 +569,16 @@ static inline int llog_create(struct llog_ctxt *ctxt, struct llog_handle **res,
          if (lop->lop_create == NULL)
                  RETURN(-EOPNOTSUPP);
  
-        cap = current->cap_effective;             
-        cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+        raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+        if (!raised)
+                cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
          rc = lop->lop_create(ctxt, res, logid, name);
-        current->cap_effective = cap; 
+        if (!raised)
+                cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
          RETURN(rc);
  }
  
-static inline int llog_connect(struct llog_ctxt *ctxt, int count,
+static inline int llog_connect(struct llog_ctxt *ctxt,
                                 struct llog_logid *logid, struct llog_gen *gen,
                                 struct obd_uuid *uuid)
  {
@@ -476,7 +592,7 @@ static inline int llog_connect(struct llog_ctxt *ctxt, int count,
          if (lop->lop_connect == NULL)
                  RETURN(-EOPNOTSUPP);
  
-        rc = lop->lop_connect(ctxt, count, logid, gen, uuid);
+        rc = lop->lop_connect(ctxt, logid, gen, uuid);
          RETURN(rc);
  }
  
diff --git a/lustre/include/lustre_mds.h b/lustre/include/lustre_mds.h

index 47d5073..d7afc22 100644 (file)
--- a/lustre/include/lustre_mds.h
+++ b/lustre/include/lustre_mds.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mds.h
   *
   * MDS data structures.
   * See also lustre_idl.h for wire formats of requests.
@@ -171,13 +203,14 @@ struct obd_client_handle;
  void mdc_set_open_replay_data(struct obd_client_handle *och,
                                struct ptlrpc_request *open_req);
  void mdc_clear_open_replay_data(struct obd_client_handle *och);
-int mdc_close(struct obd_export *, struct obdo *, struct obd_client_handle *,
-              struct ptlrpc_request **);
+int mdc_close(struct obd_export *, struct mdc_op_data *, struct obdo *,
+              struct obd_client_handle *, struct ptlrpc_request **);
  int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset,
                   struct page *, struct ptlrpc_request **);
  int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data,
                 const void *data, int datalen, int mode, __u32 uid, __u32 gid,
-               __u32 cap_effective, __u64 rdev,struct ptlrpc_request **request);
+               cfs_cap_t cap_effective, __u64 rdev,
+               struct ptlrpc_request **request);
  int mdc_unlink(struct obd_export *exp, struct mdc_op_data *data,
                 struct ptlrpc_request **request);
  int mdc_link(struct obd_export *exp, struct mdc_op_data *data,
@@ -196,10 +229,10 @@ int mdc_resource_get_unused(struct obd_export *exp, struct ll_fid *fid,
  void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff,
                                  int repoff);
  int mdc_llog_process(struct obd_export *, char *logname, llog_cb_t, void *data);
-int mdc_done_writing(struct obd_export *exp, struct obdo *);
+int mdc_done_writing(struct obd_export *, struct mdc_op_data *, struct obdo *);
  
-static inline void mdc_pack_fid(struct ll_fid *fid, obd_id ino, __u32 gen,
-                                int type)
+static inline void ll_pack_fid(struct ll_fid *fid, obd_id ino, __u32 gen,
+                               int type)
  {
          fid->id = ino;
          fid->generation = gen;
@@ -237,7 +270,6 @@ typedef int (* md_enqueue_cb_t)(struct obd_export *exp,
                                  int rc);
  
  struct md_enqueue_info {
-        struct obd_export      *mi_exp;
          struct mdc_op_data      mi_data;
          struct lookup_intent    mi_it;
          struct lustre_handle    mi_lockh;
@@ -247,9 +279,4 @@ struct md_enqueue_info {
          void                   *mi_cbdata;
  };
  
-struct mdc_enqueue_args {
-        struct md_enqueue_info   *ma_mi;
-        struct ldlm_enqueue_info *ma_ei;
-};
-
  #endif
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h

index 7e57a14..cccf70c 100644 (file)
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _LUSTRE_NET_H
@@ -41,6 +55,8 @@
  #include <lustre_import.h>
  #include <lprocfs_status.h>
  
+#include <obd_support.h>
+
  /* MD flags we _always_ use */
  #define PTLRPC_MD_OPTIONS  0
  
@@ -128,7 +144,7 @@
  #define MGS_MAXREPSIZE  (9 * 1024)
  
  /* Absolute limits */
-#define OSS_THREADS_MIN 2
+#define OSS_THREADS_MIN 3       /* difficult replies, HPQ, others */
  #define OSS_THREADS_MAX 512
  #define OST_NBUFS       (64 * num_online_cpus())
  #define OST_BUFSIZE     (8 * 1024)
@@ -141,8 +157,10 @@
  #define OST_MAXREQSIZE  (5 * 1024)
  #define OST_MAXREPSIZE  (9 * 1024)
  
+/* Macro to hide a typecast. */
+#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args)
+
  struct ptlrpc_connection {
-        struct list_head        c_link;
          struct hlist_node       c_hash;
          lnet_nid_t              c_self;
          lnet_process_id_t       c_peer;
@@ -244,11 +262,13 @@ struct ptlrpc_reply_state {
  struct ptlrpc_thread;
  
  enum rq_phase {
-        RQ_PHASE_NEW         = 0xebc0de00,
-        RQ_PHASE_RPC         = 0xebc0de01,
-        RQ_PHASE_BULK        = 0xebc0de02,
-        RQ_PHASE_INTERPRET   = 0xebc0de03,
-        RQ_PHASE_COMPLETE    = 0xebc0de04,
+        RQ_PHASE_NEW            = 0xebc0de00,
+        RQ_PHASE_RPC            = 0xebc0de01,
+        RQ_PHASE_BULK           = 0xebc0de02,
+        RQ_PHASE_INTERPRET      = 0xebc0de03,
+        RQ_PHASE_COMPLETE       = 0xebc0de04,
+        RQ_PHASE_UNREGISTERING  = 0xebc0de05,
+        RQ_PHASE_UNDEFINED      = 0xebc0de06
  };
  
  struct ptlrpc_request_pool {
@@ -258,11 +278,28 @@ struct ptlrpc_request_pool {
          void (*prp_populate)(struct ptlrpc_request_pool *, int);
  };
  
+struct ldlm_lock;
+
+struct ptlrpc_hpreq_ops {
+        /**
+         * Check if the lock handle of the given lock is the same as
+         * taken from the request.
+         */
+        int  (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+        /**
+         * Check if the request is a high priority one.
+         */
+        int  (*hpreq_check)(struct ptlrpc_request *);
+};
+
  struct ptlrpc_request {
          int rq_type; /* one of PTL_RPC_MSG_* */
          struct list_head rq_list;
          struct list_head rq_timed_list;         /* server-side early replies */
          struct list_head rq_history_list;       /* server-side history */
+        struct list_head rq_exp_list;           /* server-side per-export list */
+        struct ptlrpc_hpreq_ops *rq_ops;        /* server-side hp handlers */
+
          __u64            rq_history_seq;        /* history sequence # */
          int rq_status;
          spinlock_t rq_lock;
@@ -283,8 +320,10 @@ struct ptlrpc_request {
                  rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1,
                  /* server-side flags */
                  rq_packed_final:1,  /* packed final reply */
-                rq_sent_final:1;    /* stop sending early replies */
-        enum rq_phase rq_phase; /* one of RQ_PHASE_* */
+                rq_sent_final:1,    /* stop sending early replies */
+                rq_hp:1;            /* high priority RPC */
+        enum rq_phase rq_phase;     /* one of RQ_PHASE_* */
+        enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
          atomic_t rq_refcount;   /* client-side refcount for SENT race,
                                     server-side refcounf for multiple replies */
  
@@ -341,12 +380,14 @@ struct ptlrpc_request {
  
          struct ptlrpc_bulk_desc *rq_bulk;       /* client side bulk */
          /* client outgoing req */
-        time_t rq_sent;                         /* when request sent, seconds, 
+        time_t rq_sent;                         /* when request sent, seconds,
                                                   * or time when request should
                                                   * be sent */
          volatile time_t rq_deadline;     /* when request must finish. volatile
-               so that servers' early reply updates to the deadline aren't 
+               so that servers' early reply updates to the deadline aren't
                 kept in per-cpu cache */
+        time_t rq_reply_deadline;        /* when req reply unlink must finish. */
+        time_t rq_bulk_deadline;         /* when req bulk unlink must finish. */
          int    rq_timeout;               /* service time estimate (secs) */
  
          /* Multi-rpc bits */
@@ -394,11 +435,10 @@ static inline int lustre_rep_need_swab(struct ptlrpc_request *req)
          return req->rq_rep_swab_mask & (1 << MSG_PTLRPC_HEADER_OFF);
  }
  
-
  static inline const char *
-ptlrpc_rqphase2str(struct ptlrpc_request *req)
+ptlrpc_phase2str(enum rq_phase phase)
  {
-        switch (req->rq_phase) {
+        switch (phase) {
          case RQ_PHASE_NEW:
                  return "New";
          case RQ_PHASE_RPC:
@@ -409,11 +449,19 @@ ptlrpc_rqphase2str(struct ptlrpc_request *req)
                  return "Interpret";
          case RQ_PHASE_COMPLETE:
                  return "Complete";
+        case RQ_PHASE_UNREGISTERING:
+                return "Unregistering";
          default:
                  return "?Phase?";
          }
  }
  
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+        return ptlrpc_phase2str(req->rq_phase);
+}
+
  /* Spare the preprocessor, spoil the bugs. */
  #define FLAG(field, str) (field ? str : "")
  
@@ -424,9 +472,9 @@ ptlrpc_rqphase2str(struct ptlrpc_request *req)
          FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
          FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"),                \
          FLAG(req->rq_no_resend, "N"),                                         \
-        FLAG(req->rq_waiting, "W")
+        FLAG(req->rq_waiting, "W"), FLAG(req->rq_hp, "H")
  
-#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s"
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s"
  
  void _debug_req(struct ptlrpc_request *req, __u32 mask,
                  struct libcfs_debug_msg_data *data, const char *fmt, ...)
@@ -501,12 +549,14 @@ struct ptlrpc_bulk_desc {
  
  struct ptlrpc_thread {
  
-        struct list_head t_link; /* active threads for service, from svc->srv_threads */
+        struct list_head t_link; /* active threads in svc->srv_threads */
  
          void *t_data;            /* thread-private data (preallocated memory) */
          __u32 t_flags;
  
          unsigned int t_id; /* service thread index, from ptlrpc_start_threads */
+        struct lc_watchdog *t_watchdog; /* put watchdog in the structure per
+                                         * thread b=14840 */
          cfs_waitq_t t_ctl_waitq;
  };
  
@@ -523,6 +573,9 @@ struct ptlrpc_request_buffer_desc {
  
  typedef int (*svc_handler_t)(struct ptlrpc_request *req);
  typedef void (*svcreq_printfn_t)(void *, struct ptlrpc_request *);
+typedef int (*svc_hpreq_handler_t)(struct ptlrpc_request *);
+
+#define PTLRPC_SVC_HP_RATIO 10
  
  struct ptlrpc_service {
          struct list_head srv_list;              /* chain thru all services */
@@ -537,6 +590,7 @@ struct ptlrpc_service {
          int              srv_threads_running;   /* # running threads */
          int              srv_n_difficult_replies; /* # 'difficult' replies */
          int              srv_n_active_reqs;     /* # reqs being served */
+        int              srv_n_hpreq;           /* # HPreqs being served */
          cfs_duration_t   srv_rqbd_timeout;      /* timeout before re-posting reqs, in tick */
          int              srv_watchdog_factor;   /* soft watchdog timeout mutiplier */
          unsigned         srv_cpu_affinity:1;    /* bind threads to CPUs */
@@ -545,7 +599,7 @@ struct ptlrpc_service {
  
          __u32            srv_req_portal;
          __u32            srv_rep_portal;
-        
+
          /* AT stuff */
          struct adaptive_timeout srv_at_estimate;/* estimated rpc service time */
          spinlock_t        srv_at_lock;
@@ -553,8 +607,11 @@ struct ptlrpc_service {
          cfs_timer_t       srv_at_timer;         /* early reply timer */
  
          int               srv_n_queued_reqs;    /* # reqs in either of the queues below */
+        int               srv_hpreq_count;      /* # hp requests handled */
+        int               srv_hpreq_ratio;      /* # hp per lp reqs to handle */
          struct list_head  srv_req_in_queue;     /* incoming reqs */
          struct list_head  srv_request_queue;    /* reqs waiting for service */
+        struct list_head  srv_request_hpq;      /* high priority queue */
  
          struct list_head  srv_request_history;  /* request history */
          __u64             srv_request_seq;      /* next request sequence # */
@@ -579,6 +636,7 @@ struct ptlrpc_service {
  
          struct list_head   srv_threads;         /* service thread list */
          svc_handler_t      srv_handler;
+        svc_hpreq_handler_t srv_hpreq_handler;  /* hp request handler */
  
          char *srv_name;  /* only statically allocated strings here; we don't clean them */
          char *srv_thread_name;  /* only statically allocated strings here; we don't clean them */
@@ -592,7 +650,7 @@ struct ptlrpc_service {
          struct list_head         srv_free_rs_list;
          /* waitq to run, when adding stuff to srv_free_rs_list */
          cfs_waitq_t              srv_free_rs_waitq;
-        
+
          /*
           * if non-NULL called during thread creation (ptlrpc_start_thread())
           * to initialize service specific per-thread state.
@@ -607,6 +665,70 @@ struct ptlrpc_service {
          //struct ptlrpc_srv_ni srv_interfaces[0];
  };
  
+struct ptlrpcd_ctl {
+        /**
+         * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+         */
+        unsigned long               pc_flags;
+        /**
+         * Thread lock protecting structure fields.
+         */
+        spinlock_t                  pc_lock;
+        /**
+         * Start completion.
+         */
+        struct completion           pc_starting;
+        /**
+         * Stop completion.
+         */
+        struct completion           pc_finishing;
+        /**
+         * Thread requests set.
+         */
+        struct ptlrpc_request_set  *pc_set;
+        /**
+         * Thread name used in cfs_daemonize()
+         */
+        char                        pc_name[16];
+#ifndef __KERNEL__
+        /**
+         * Async rpcs flag to make sure that ptlrpcd_check() is called only 
+         * once.
+         */
+        int                         pc_recurred;
+        /**
+         * Currently not used.
+         */
+        void                       *pc_callback;
+        /**
+         * User-space async rpcs callback.
+         */
+        void                       *pc_wait_callback;
+        /**
+         * User-space check idle rpcs callback.
+         */
+        void                       *pc_idle_callback;
+#endif
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+        /**
+         * Ptlrpc thread start flag.
+         */
+        LIOD_START       = 1 << 0,
+        /**
+         * Ptlrpc thread stop flag.
+         */
+        LIOD_STOP        = 1 << 1,
+        /**
+         * Ptlrpc thread force flag (only stop force so far).
+         * This will cause aborting any inflight rpcs handled
+         * by thread if LIOD_STOP is specified.
+         */
+        LIOD_FORCE       = 1 << 2
+};
+
  /* ptlrpc/events.c */
  extern lnet_handle_eq_t ptlrpc_eq_h;
  extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
@@ -619,30 +741,51 @@ extern void reply_out_callback(lnet_event_t *ev);
  extern void server_bulk_callback (lnet_event_t *ev);
  
  /* ptlrpc/connection.c */
-void ptlrpc_dump_connections(void);
-void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *);
-struct ptlrpc_connection *ptlrpc_get_connection(lnet_process_id_t peer,
-                                                lnet_nid_t self, struct obd_uuid *uuid);
-int ptlrpc_put_connection(struct ptlrpc_connection *c);
+struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer,
+                                                lnet_nid_t self,
+                                                struct obd_uuid *uuid);
+int ptlrpc_connection_put(struct ptlrpc_connection *c);
  struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
-int ptlrpc_init_connection(void);
-void ptlrpc_cleanup_connection(void);
+int ptlrpc_connection_init(void);
+void ptlrpc_connection_fini(void);
  extern lnet_pid_t ptl_get_pid(void);
  
  /* ptlrpc/niobuf.c */
  int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc);
  void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc);
  int ptlrpc_register_bulk(struct ptlrpc_request *req);
-void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async);
  
-static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc)
+static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc)
  {
-        int           rc;
+        int rc;
+
+        LASSERT(desc != NULL);
+
+        spin_lock(&desc->bd_lock);
+        rc = desc->bd_network_rw;
+        spin_unlock(&desc->bd_lock);
+        return rc;
+}
+
+static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
+{
+        struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+        int                      rc;
+
+        LASSERT(req != NULL);
+
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+            req->rq_bulk_deadline > cfs_time_current_sec())
+                return 1;
+
+        if (!desc)
+                return 0;
  
          spin_lock(&desc->bd_lock);
          rc = desc->bd_network_rw;
          spin_unlock(&desc->bd_lock);
-        return (rc);
+        return rc;
  }
  
  #define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
@@ -662,31 +805,12 @@ void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
  void ptlrpc_cleanup_client(struct obd_import *imp);
  struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
  
-static inline int
-ptlrpc_client_recv_or_unlink (struct ptlrpc_request *req)
-{
-        int           rc;
-
-        spin_lock(&req->rq_lock);
-        rc = req->rq_receiving_reply || req->rq_must_unlink;
-        spin_unlock(&req->rq_lock);
-        return (rc);
-}
-
-static inline void
-ptlrpc_wake_client_req (struct ptlrpc_request *req)
-{
-        if (req->rq_set == NULL)
-                cfs_waitq_signal(&req->rq_reply_waitq);
-        else
-                cfs_waitq_signal(&req->rq_set->set_waitq);
-}
-
  int ptlrpc_queue_wait(struct ptlrpc_request *req);
  int ptlrpc_replay_req(struct ptlrpc_request *req);
-void ptlrpc_unregister_reply(struct ptlrpc_request *req);
+int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async);
  void ptlrpc_restart_req(struct ptlrpc_request *req);
  void ptlrpc_abort_inflight(struct obd_import *imp);
+void ptlrpc_abort_set(struct ptlrpc_request_set *set);
  
  struct ptlrpc_request_set *ptlrpc_prep_set(void);
  int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
@@ -699,22 +823,24 @@ void ptlrpc_interrupted_set(void *data);
  void ptlrpc_mark_interrupted(struct ptlrpc_request *req);
  void ptlrpc_set_destroy(struct ptlrpc_request_set *);
  void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
-void ptlrpc_set_add_new_req(struct ptlrpc_request_set *,
-                            struct ptlrpc_request *);
+int ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+                           struct ptlrpc_request *req);
  
  void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
  void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
-struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int, int,
-                                                void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int, int,
+                    void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
  void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
  struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
-                                       int opcode, int count, int *lengths,
+                                       int opcode, int count, __u32 *lengths,
                                         char **bufs);
  struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp,
                                               __u32 version, int opcode,
-                                            int count, int *lengths, char **bufs,
+                                            int count, __u32 *lengths, char **bufs,
                                              struct ptlrpc_request_pool *pool);
-void ptlrpc_free_req(struct ptlrpc_request *request);
  void ptlrpc_req_finished(struct ptlrpc_request *request);
  void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
  struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
@@ -734,7 +860,7 @@ __u64 ptlrpc_req_xid(struct ptlrpc_request *request);
  /* ptlrpc/service.c */
  void ptlrpc_save_lock (struct ptlrpc_request *req,
                         struct lustre_handle *lock, int mode);
-void ptlrpc_commit_replies (struct obd_device *obd);
+void ptlrpc_commit_replies (struct obd_export *exp);
  void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs);
  struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
                                         int max_reply_size,
@@ -742,9 +868,9 @@ struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
                                         int watchdog_factor,
                                         svc_handler_t, char *name,
                                         cfs_proc_dir_entry_t *proc_entry,
-                                       svcreq_printfn_t, 
+                                       svcreq_printfn_t,
                                         int min_threads, int max_threads,
-                                       char *threadname);
+                                       char *threadname, svc_hpreq_handler_t);
  void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
  
  int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc);
@@ -753,6 +879,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service);
  int liblustre_check_services (void *arg);
  void ptlrpc_daemonize(char *name);
  int ptlrpc_service_health_check(struct ptlrpc_service *);
+void ptlrpc_hpreq_reorder(struct ptlrpc_request *req);
  
  
  struct ptlrpc_svc_data {
@@ -774,18 +901,18 @@ int ptlrpc_reconnect_import(struct obd_import *imp);
  int lustre_msg_swabbed(struct lustre_msg *msg);
  int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
  int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
-                        int *lens, char **bufs);
-int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
+                        __u32 *lens, char **bufs);
+int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens,
                        char **bufs);
  #define LPRFL_EARLY_REPLY 1
-int lustre_pack_reply_flags(struct ptlrpc_request *, int count, int *lens,
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens,
                              char **bufs, int flags);
  void lustre_shrink_reply(struct ptlrpc_request *req, int segment,
                           unsigned int newlen, int move_data);
  void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
-int lustre_msg_size(__u32 magic, int count, int *lengths);
+int lustre_msg_size(__u32 magic, int count, __u32 *lengths);
  int lustre_packed_msg_size(struct lustre_msg *msg);
-int lustre_msg_early_size(void);
+int lustre_msg_early_size(struct ptlrpc_request *req);
  int lustre_unpack_msg(struct lustre_msg *m, int len);
  void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
  int lustre_msg_buflen(struct lustre_msg *m, int n);
@@ -812,6 +939,7 @@ void lustre_msg_add_version(struct lustre_msg *msg, int version);
  __u32 lustre_msg_get_opc(struct lustre_msg *msg);
  __u64 lustre_msg_get_last_xid(struct lustre_msg *msg);
  __u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg);
  __u64 lustre_msg_get_transno(struct lustre_msg *msg);
  __u64 lustre_msg_get_slv(struct lustre_msg *msg);
  __u32 lustre_msg_get_limit(struct lustre_msg *msg);
@@ -830,6 +958,7 @@ void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
  void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
  void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid);
  void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed);
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions);
  void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
  void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
  void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
@@ -838,6 +967,81 @@ void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
  void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
  
  static inline void
+ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase)
+{
+        if (req->rq_phase == new_phase)
+                return;
+        
+        if (new_phase == RQ_PHASE_UNREGISTERING) {
+                req->rq_next_phase = req->rq_phase;
+                if (req->rq_import)
+                        atomic_inc(&req->rq_import->imp_unregistering);
+        }
+        
+        if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+                if (req->rq_import)
+                        atomic_dec(&req->rq_import->imp_unregistering);
+        }
+
+        DEBUG_REQ(D_RPCTRACE, req, "move req \"%s\" -> \"%s\"", 
+                  ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase));
+
+        req->rq_phase = new_phase;
+}
+
+static inline int
+ptlrpc_client_early(struct ptlrpc_request *req)
+{
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+            req->rq_reply_deadline > cfs_time_current_sec())
+                return 0;
+        return req->rq_early;
+}
+
+static inline int
+ptlrpc_client_replied(struct ptlrpc_request *req)
+{
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+            req->rq_reply_deadline > cfs_time_current_sec())
+                return 0;
+        return req->rq_replied;
+}
+
+static inline int
+ptlrpc_client_recv(struct ptlrpc_request *req)
+{
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+            req->rq_reply_deadline > cfs_time_current_sec())
+                return 1;
+        return req->rq_receiving_reply;
+}
+
+static inline int
+ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+{
+        int rc;
+
+        spin_lock(&req->rq_lock);
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+            req->rq_reply_deadline > cfs_time_current_sec()) {
+                spin_unlock(&req->rq_lock);
+                return 1;
+        }
+        rc = req->rq_receiving_reply || req->rq_must_unlink;
+        spin_unlock(&req->rq_lock);
+        return rc;
+}
+
+static inline void
+ptlrpc_client_wake_req(struct ptlrpc_request *req)
+{
+        if (req->rq_set == NULL)
+                cfs_waitq_signal(&req->rq_reply_waitq);
+        else
+                cfs_waitq_signal(&req->rq_set->set_waitq);
+}
+
+static inline void
  ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
  {
          LASSERT(atomic_read(&rs->rs_refcount) > 0);
@@ -855,7 +1059,7 @@ ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
  /* Should only be called once per req */
  static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
  {
-        if (req->rq_reply_state == NULL) 
+        if (req->rq_reply_state == NULL)
                  return; /* shouldn't occur */
          ptlrpc_rs_decref(req->rq_reply_state);
          req->rq_reply_state = NULL;
@@ -883,11 +1087,11 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
  }
  
  static inline void
-ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *lens)
+ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens)
  {
          int size = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
-        
-        req->rq_replen = size + lustre_msg_early_size();
+
+        req->rq_replen = size + lustre_msg_early_size(req);
          if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
                  req->rq_reqmsg->lm_repsize = size;
  }
@@ -907,6 +1111,8 @@ int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
  /* ptlrpc/pinger.c */
  int ptlrpc_pinger_add_import(struct obd_import *imp);
  int ptlrpc_pinger_del_import(struct obd_import *imp);
+struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp);
+int ptlrpc_obd_ping(struct obd_device *obd);
  #ifdef __KERNEL__
  void ping_evictor_start(void);
  void ping_evictor_stop(void);
@@ -916,6 +1122,8 @@ void ping_evictor_stop(void);
  #endif
  
  /* ptlrpc/ptlrpcd.c */
+int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc);
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
  void ptlrpcd_wake(struct ptlrpc_request *req);
  void ptlrpcd_add_req(struct ptlrpc_request *req);
  int ptlrpcd_addref(void);
@@ -926,12 +1134,11 @@ const char* ll_opcode2str(__u32 opcode);
  #ifdef LPROCFS
  void ptlrpc_lprocfs_register_obd(struct obd_device *obd);
  void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd);
-void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int opc, int bytes);
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
  #else
  static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {}
  static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
-static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int opc,
-                                      int bytes) {}
+static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
  #endif
  
  /* ptlrpc/llog_server.c */
diff --git a/lustre/include/lustre_param.h b/lustre/include/lustre_param.h

index 281cf6e..52b1d29 100644 (file)
--- a/lustre/include/lustre_param.h
+++ b/lustre/include/lustre_param.h
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_param.h
   *
   * User-settable parameter keys
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  #ifndef _LUSTRE_PARAM_H
diff --git a/lustre/include/lustre_quota.h b/lustre/include/lustre_quota.h

index a2d3635..43a0d31 100644 (file)
--- a/lustre/include/lustre_quota.h
+++ b/lustre/include/lustre_quota.h
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _LUSTRE_QUOTA_H
  #define _LUSTRE_QUOTA_H
  
@@ -18,6 +51,7 @@
  #include <lustre_net.h>
  #include <lvfs.h>
  #include <obd_support.h>
+#include <class_hash.h>
  
  struct obd_device;
  struct client_obd;
@@ -30,6 +64,40 @@ struct client_obd;
  
  #ifdef __KERNEL__
  
+#ifdef LPROCFS
+enum {
+        LQUOTA_FIRST_STAT = 0,
+        /* these four are for measuring quota requests, for both of
+         * quota master and quota slaves */
+        LQUOTA_SYNC_ACQ = LQUOTA_FIRST_STAT,
+        LQUOTA_SYNC_REL,
+        LQUOTA_ASYNC_ACQ,
+        LQUOTA_ASYNC_REL,
+        /* these four measure how much time I/O threads spend on dealing
+         * with quota before and after writing data or creating files,
+         * only for quota slaves(lquota_chkquota and lquota_pending_commit) */
+        LQUOTA_WAIT_FOR_CHK_BLK,
+        LQUOTA_WAIT_FOR_CHK_INO,
+        LQUOTA_WAIT_FOR_COMMIT_BLK,
+        LQUOTA_WAIT_FOR_COMMIT_INO,
+        /* these two are for measuring time waiting return of quota reqs
+         * (qctxt_wait_pending_dqacq), only for quota salves */
+        LQUOTA_WAIT_PENDING_BLK_QUOTA,
+        LQUOTA_WAIT_PENDING_INO_QUOTA,
+        /* these two are for those when they are calling
+         * qctxt_wait_pending_dqacq, the quota req has returned already,
+         * only for quota salves */
+        LQUOTA_NOWAIT_PENDING_BLK_QUOTA,
+        LQUOTA_NOWAIT_PENDING_INO_QUOTA,
+        /* these are for quota ctl */
+        LQUOTA_QUOTA_CTL,
+        /* these are for adjust quota qunit, for both of
+         * quota master and quota slaves  */
+        LQUOTA_ADJUST_QUNIT,
+        LQUOTA_LAST_STAT
+};
+#endif  /* LPROCFS */
+
  /* structures to access admin quotafile */
  struct lustre_mem_dqinfo {
          unsigned int dqi_bgrace;
@@ -93,7 +161,6 @@ struct dquot_id {
  #define QFILE_CONVERT           7
  
  /* admin quotafile operations */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
  int lustre_check_quota_file(struct lustre_quota_info *lqi, int type);
  int lustre_read_quota_info(struct lustre_quota_info *lqi, int type);
  int lustre_write_quota_info(struct lustre_quota_info *lqi, int type);
@@ -103,46 +170,6 @@ int lustre_init_quota_info(struct lustre_quota_info *lqi, int type);
  int lustre_get_qids(struct file *file, struct inode *inode, int type,
                      struct list_head *list);
  int lustre_quota_convert(struct lustre_quota_info *lqi, int type);
-#else
-
-#ifndef DQ_FAKE_B
-#define DQ_FAKE_B       6
-#endif
-
-static inline int lustre_check_quota_file(struct lustre_quota_info *lqi,
-                                          int type)
-{
-        return 0;
-}
-static inline int lustre_read_quota_info(struct lustre_quota_info *lqi,
-                                         int type)
-{
-        return 0;
-}
-static inline int lustre_write_quota_info(struct lustre_quota_info *lqi,
-                                          int type)
-{
-        return 0;
-}
-static inline int lustre_read_dquot(struct lustre_dquot *dquot)
-{
-        return 0;
-}
-static inline int lustre_commit_dquot(struct lustre_dquot *dquot)
-{
-        return 0;
-}
-static inline int lustre_init_quota_info(struct lustre_quota_info *lqi,
-                                         int type)
-{
-        return 0;
-}
-static inline int lustre_quota_convert(struct lustre_quota_info *lqi,
-                                       int type)
-{
-        return 0;
-}
-#endif  /* KERNEL_VERSION(2,5,0) */
  
  #define LL_DQUOT_OFF(sb)    DQUOT_OFF(sb)
  
@@ -163,8 +190,13 @@ struct lustre_quota_ctxt {
          dqacq_handler_t lqc_handler;    /* dqacq/dqrel RPC handler, only for quota master */
          unsigned long lqc_flags;        /* quota flags */
          unsigned long lqc_recovery:1,   /* Doing recovery */
-                      lqc_switch_qs:1;  /* the function of change qunit size
+                      lqc_switch_qs:1,  /* the function of change qunit size
                                           * 0:Off, 1:On */
+                      lqc_valid:1,      /* this qctxt is valid or not */
+                      lqc_setup:1;      /* tell whether of not quota_type has
+                                         * been processed, so that the master
+                                         * knows when it can start processing
+                                         * incoming acq/rel quota requests */
          unsigned long lqc_iunit_sz;     /* original unit size of file quota and
                                           * upper limitation for adjust file
                                           * qunit */
@@ -176,9 +208,8 @@ struct lustre_quota_ctxt {
                                           * upper limitation for adjust block
                                           * qunit */
          unsigned long lqc_btune_sz;     /* See comment of lqc_itune_sz */
-        struct lustre_class_hash_body *lqc_lqs_hash_body;
-                                        /* all lustre_qunit_size structure in
-                                         * it */
+        struct lustre_hash *lqc_lqs_hash; /* all lustre_qunit_size structures */
+
          /* the values below are relative to how master change its qunit sizes */
          unsigned long lqc_cqs_boundary_factor; /* this affects the boundary of
                                                  * shrinking and enlarging qunit
@@ -192,10 +223,20 @@ struct lustre_quota_ctxt {
                                               * adjusting qunit size. How many
                                               * seconds must be waited between
                                               * enlarging and shinking qunit */
+        int           lqc_sync_blk;         /* when blk qunit reaches this value,
+                                             * later write reqs from client
+                                             * should be sync b=16642 */
          spinlock_t    lqc_lock;         /* guard lqc_imp_valid now */
+        cfs_waitq_t   lqc_wait_for_qmaster; /* when mds isn't connected, threads
+                                             * on osts who send the quota reqs
+                                             * with wait==1 will be put here
+                                             * b=14840 */
+        struct proc_dir_entry *lqc_proc_dir;
+        struct lprocfs_stats  *lqc_stats; /* lquota statistics */
  };
  
-#define LQC_HASH_BODY(qctxt) (qctxt->lqc_lqs_hash_body)
+#define QUOTA_MASTER_READY(qctxt)   (qctxt)->lqc_setup = 1
+#define QUOTA_MASTER_UNREADY(qctxt) (qctxt)->lqc_setup = 0
  
  struct lustre_qunit_size {
          struct hlist_node lqs_hash; /* the hash entry */
@@ -219,6 +260,8 @@ struct lustre_qunit_size {
          cfs_time_t lqs_last_bshrink;   /* time of last block shrink */
          cfs_time_t lqs_last_ishrink;   /* time of last inode shrink */
          spinlock_t lqs_lock;
+        struct quota_adjust_qunit lqs_key; /* hash key */
+        struct lustre_quota_ctxt *lqs_ctxt; /* quota ctxt */
  };
  
  #define LQS_IS_GRP(lqs)    ((lqs)->lqs_flags & LQUOTA_FLAGS_GRP)
@@ -232,15 +275,24 @@ struct lustre_qunit_size {
  static inline void lqs_getref(struct lustre_qunit_size *lqs)
  {
          atomic_inc(&lqs->lqs_refcount);
+        CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
+               lqs, atomic_read(&lqs->lqs_refcount));
  }
  
  static inline void lqs_putref(struct lustre_qunit_size *lqs)
  {
-        if (atomic_dec_and_test(&lqs->lqs_refcount)) {
-                spin_lock(&lqs->lqs_lock);
-                hlist_del_init(&lqs->lqs_hash);
-                spin_unlock(&lqs->lqs_lock);
+        LASSERT(atomic_read(&lqs->lqs_refcount) > 0);
+
+        /* killing last ref, let's let hash table kill it */
+        if (atomic_read(&lqs->lqs_refcount) == 1) {
+                lustre_hash_del(lqs->lqs_ctxt->lqc_lqs_hash,
+                                &lqs->lqs_key, &lqs->lqs_hash);
                  OBD_FREE_PTR(lqs);
+        } else {
+                atomic_dec(&lqs->lqs_refcount);
+                CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
+                       lqs, atomic_read(&lqs->lqs_refcount));
+
          }
  }
  
@@ -269,6 +321,9 @@ struct lustre_quota_info {
  struct lustre_quota_ctxt {
  };
  
+#define QUOTA_MASTER_READY(qctxt)
+#define QUOTA_MASTER_UNREADY(qctxt)
+
  #endif /* !HAVE_QUOTA_SUPPORT */
  
  /* If the (quota limit < qunit * slave count), the slave which can't
@@ -282,8 +337,9 @@ struct quotacheck_thread_args {
          atomic_t            *qta_sem;   /* obt_quotachecking */
  };
  
-typedef int (*quota_acquire)(struct obd_device *obd,
-                             unsigned int uid, unsigned int gid);
+struct obd_trans_info;
+typedef int (*quota_acquire)(struct obd_device *obd, unsigned int uid,
+                             unsigned int gid, struct obd_trans_info *oti);
  
  typedef struct {
          int (*quota_init) (void);
@@ -311,33 +367,36 @@ typedef struct {
  
          /* For quota slave, check whether specified uid/gid is over quota */
          int (*quota_getflag) (struct obd_device *, struct obdo *);
-
+#ifdef __KERNEL__
          /* For quota slave, acquire/release quota from master if needed */
-        int (*quota_acquire) (struct obd_device *, unsigned int, unsigned int);
+        int (*quota_acquire) (struct obd_device *, unsigned int, unsigned int,
+                              struct obd_trans_info *);
  
          /* For quota slave, check whether specified uid/gid's remaining quota
           * can finish a block_write or inode_create rpc. It updates the pending
           * record of block and inode, acquires quota if necessary */
          int (*quota_chkquota) (struct obd_device *, unsigned int, unsigned int,
-                               int, int *, quota_acquire);
+                               int, int *, quota_acquire,
+                               struct obd_trans_info *, struct inode *, int);
  
+        /* For quota client, the actions after the pending write is committed */
+        int (*quota_pending_commit) (struct obd_device *, unsigned int,
+                                     unsigned int, int);
+#endif
          /* For quota client, poll if the quota check done */
          int (*quota_poll_check) (struct obd_export *, struct if_quotacheck *);
  
          /* For quota client, check whether specified uid/gid is over quota */
          int (*quota_chkdq) (struct client_obd *, unsigned int, unsigned int);
  
-        /* For quota client, the actions after the pending write is committed */
-        int (*quota_pending_commit) (struct obd_device *, unsigned int,
-                                     unsigned int, int);
-
          /* For quota client, set over quota flag for specifed uid/gid */
          int (*quota_setdq) (struct client_obd *, unsigned int, unsigned int,
                              obd_flag, obd_flag);
  
          /* For adjusting qunit size b=10600 */
-        int (*quota_adjust_qunit) (struct obd_export *exp, struct
-                                   quota_adjust_qunit *oqaq);
+        int (*quota_adjust_qunit) (struct obd_export *exp,
+                                   struct quota_adjust_qunit *oqaq,
+                                   struct lustre_quota_ctxt *qctxt);
  
  } quota_interface_t;
  
@@ -517,22 +576,25 @@ static inline int lquota_getflag(quota_interface_t *interface,
          RETURN(rc);
  }
  
+#ifdef __KERNEL__
  static inline int lquota_acquire(quota_interface_t *interface,
                                   struct obd_device *obd,
-                                 unsigned int uid, unsigned int gid)
+                                 unsigned int uid, unsigned int gid,
+                                 struct obd_trans_info *oti)
  {
          int rc;
          ENTRY;
  
          QUOTA_CHECK_OP(interface, acquire);
-        rc = QUOTA_OP(interface, acquire)(obd, uid, gid);
+        rc = QUOTA_OP(interface, acquire)(obd, uid, gid, oti);
          RETURN(rc);
  }
  
  static inline int lquota_chkquota(quota_interface_t *interface,
                                    struct obd_device *obd,
-                                  unsigned int uid, unsigned int gid,
-                                  int count, int *flag)
+                                  unsigned int uid, unsigned int gid, int count,
+                                  int *flag, struct obd_trans_info *oti,
+                                  struct inode *inode, int frags)
  {
          int rc;
          ENTRY;
@@ -540,52 +602,42 @@ static inline int lquota_chkquota(quota_interface_t *interface,
          QUOTA_CHECK_OP(interface, chkquota);
          QUOTA_CHECK_OP(interface, acquire);
          rc = QUOTA_OP(interface, chkquota)(obd, uid, gid, count, flag,
-                                           QUOTA_OP(interface, acquire));
+                                           QUOTA_OP(interface, acquire), oti,
+                                           inode, frags);
          RETURN(rc);
  }
  
  static inline int lquota_pending_commit(quota_interface_t *interface,
                                          struct obd_device *obd,
                                          unsigned int uid, unsigned int gid,
-                                        int npage)
+                                        int pending)
  {
          int rc;
          ENTRY;
  
          QUOTA_CHECK_OP(interface, pending_commit);
-        rc = QUOTA_OP(interface, pending_commit)(obd, uid, gid, npage);
+        rc = QUOTA_OP(interface, pending_commit)(obd, uid, gid, pending);
          RETURN(rc);
  }
-
-int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count,
-                           int *eof, void *data);
-int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
-                           unsigned long count, void *data);
-int lprocfs_quota_rd_btune(char *page, char **start, off_t off, int count,
-                           int *eof, void *data);
-int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
-                           unsigned long count, void *data);
-int lprocfs_quota_rd_iunit(char *page, char **start, off_t off, int count,
-                           int *eof, void *data);
-int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
-                           unsigned long count, void *data);
-int lprocfs_quota_rd_itune(char *page, char **start, off_t off, int count,
-                           int *eof, void *data);
-int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
-                           unsigned long count, void *data);
-int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count,
-                          int *eof, void *data);
-int lprocfs_quota_wr_type(struct file *file, const char *buffer,
-                          unsigned long count, void *data);
-int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data);
-int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer,
-                                    unsigned long count, void *data);
+#endif
  
  #ifndef __KERNEL__
  extern quota_interface_t osc_quota_interface;
  extern quota_interface_t mdc_quota_interface;
  extern quota_interface_t lov_quota_interface;
+
+#ifndef MAXQUOTAS
+#define MAXQUOTAS 2
+#endif
+
+#ifndef USRQUOTA
+#define USRQUOTA 0
+#endif
+
+#ifndef GRPQUOTA
+#define GRPQUOTA 1
+#endif
+
  #endif
  
  #define LUSTRE_ADMIN_QUOTAFILES_V1 {\
diff --git a/lustre/include/lustre_ucache.h b/lustre/include/lustre_ucache.h

index 16b5c1a..c32735e 100644 (file)
--- a/lustre/include/lustre_ucache.h
+++ b/lustre/include/lustre_ucache.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _UPCALL_CACHE_H
diff --git a/lustre/include/lustre_ver.h.in b/lustre/include/lustre_ver.h.in

index 1c63510..9027021 100644 (file)
--- a/lustre/include/lustre_ver.h.in
+++ b/lustre/include/lustre_ver.h.in
@@ -9,6 +9,10 @@
  #define LUSTRE_PATCH @AC_LUSTRE_PATCH@
  #define LUSTRE_FIX @AC_LUSTRE_FIX@
  #define LUSTRE_VERSION_STRING "@AC_LUSTRE_VERSION_STRING@"
+#define CLIENT_URN "@AC_LUSTRE_CLIENT_URN@"
+#define MDS_URN "@AC_LUSTRE_MDS_URN@"
+#define MGS_URN "@AC_LUSTRE_MGS_URN@"
+#define OSS_URN "@AC_LUSTRE_OSS_URN@"
  
  #define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX)
  
diff --git a/lustre/include/lvfs.h b/lustre/include/lvfs.h

index 42e8544..a0e9872 100644 (file)
--- a/lustre/include/lvfs.h
+++ b/lustre/include/lvfs.h
@@ -1,22 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lvfs.h
   *
   * lustre VFS/process permission interface
   */
diff --git a/lustre/include/obd.h b/lustre/include/obd.h

index 88a81a3..2a5022f 100644 (file)
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __OBD_H
@@ -61,8 +93,8 @@ struct lov_oinfo {                 /* per-stripe data structure */
          /* used by the osc to keep track of what objects to build into rpcs */
          struct loi_oap_pages loi_read_lop;
          struct loi_oap_pages loi_write_lop;
-        /* _cli_ is poorly named, it should be _ready_ */
-        struct list_head loi_cli_item;
+        struct list_head loi_ready_item;
+        struct list_head loi_hp_ready_item;
          struct list_head loi_write_item;
          struct list_head loi_read_item;
  
@@ -80,7 +112,8 @@ static inline void loi_init(struct lov_oinfo *loi)
          CFS_INIT_LIST_HEAD(&loi->loi_write_lop.lop_pending);
          CFS_INIT_LIST_HEAD(&loi->loi_write_lop.lop_urgent);
          CFS_INIT_LIST_HEAD(&loi->loi_write_lop.lop_pending_group);
-        CFS_INIT_LIST_HEAD(&loi->loi_cli_item);
+        CFS_INIT_LIST_HEAD(&loi->loi_ready_item);
+        CFS_INIT_LIST_HEAD(&loi->loi_hp_ready_item);
          CFS_INIT_LIST_HEAD(&loi->loi_write_item);
          CFS_INIT_LIST_HEAD(&loi->loi_read_item);
  }
@@ -115,6 +148,7 @@ struct lov_stripe_md {
                  __u32 lw_stripe_size;      /* size of the stripe */
                  __u32 lw_pattern;          /* striping pattern (RAID0, RAID1) */
                  unsigned lw_stripe_count;  /* number of objects being striped over */
+                char  lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */
          } lsm_wire;
  
          struct lov_array_info *lsm_array; /*Only for joined file array info*/
@@ -128,6 +162,7 @@ struct lov_stripe_md {
  #define lsm_stripe_size  lsm_wire.lw_stripe_size
  #define lsm_pattern      lsm_wire.lw_pattern
  #define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name    lsm_wire.lw_pool_name
  
  struct obd_info;
  
@@ -201,6 +236,7 @@ enum async_flags {
                                      the page is accounted for in the
                                      obd_io_group given to
                                      obd_queue_group_io */
+        ASYNC_HP = 0x10,
  };
  
  struct obd_async_page_ops {
@@ -242,9 +278,25 @@ struct ost_server_data;
  /* hold common fields for "target" device */
  struct obd_device_target {
          struct super_block       *obt_sb;
+        /** last_rcvd file */
+        struct file              *obt_rcvd_filp;
+        /** server data in last_rcvd file */
+        struct lr_server_data    *obt_lsd;
+        /** Lock protecting client bitmap */
+        spinlock_t                obt_client_bitmap_lock;
+        /** Bitmap of known clients */
+        unsigned long            *obt_client_bitmap;
+        /** Server last transaction number */
+        __u64                     obt_last_transno;
+        /** Lock protecting last transaction number */
+        spinlock_t                obt_translock;
+        /** Number of mounts */
+        __u64                     obt_mount_count;
          atomic_t                  obt_quotachecking;
          struct lustre_quota_ctxt  obt_qctxt;
          lustre_quota_version_t    obt_qfmt;
+        __u32                     obt_stale_export_age;
+        spinlock_t                obt_trans_table_lock;
  };
  
  typedef void (*obd_pin_extent_cb)(void *data);
@@ -269,12 +321,7 @@ struct filter_obd {
          cfs_dentry_t       **fo_dentry_O_groups;
          cfs_dentry_t       **fo_dentry_O_sub;
          spinlock_t           fo_objidlock;      /* protect fo_lastobjid */
-        spinlock_t           fo_translock;      /* protect fsd_last_transno */
-        struct file         *fo_rcvd_filp;
          struct file         *fo_health_check_filp;
-        struct lr_server_data *fo_fsd;
-        unsigned long       *fo_last_rcvd_slots;
-        __u64                fo_mount_count;
  
          int                  fo_destroy_in_progress;
          struct semaphore     fo_create_lock;
@@ -287,6 +334,8 @@ struct filter_obd {
          obd_size             fo_tot_pending;
  
          obd_size             fo_readcache_max_filesize;
+        int                  fo_read_cache;
+        int                  fo_writethrough_cache;
  
          struct obd_import   *fo_mdc_imp;
          struct obd_uuid      fo_mdc_uuid;
@@ -322,9 +371,15 @@ struct filter_obd {
  
          int                      fo_fmd_max_num; /* per exp filter_mod_data */
          int                      fo_fmd_max_age; /* jiffies to fmd expiry */
-        void                     *fo_lcm;
+        struct llog_commit_master *fo_lcm;
  };
  
+#define fo_translock            fo_obt.obt_translock
+#define fo_rcvd_filp            fo_obt.obt_rcvd_filp
+#define fo_fsd                  fo_obt.obt_lsd
+#define fo_last_rcvd_slots      fo_obt.obt_client_bitmap
+#define fo_mount_count          fo_obt.obt_mount_count
+
  #define OSC_MAX_RIF_DEFAULT       8
  #define OSC_MAX_RIF_MAX         256
  #define OSC_MAX_DIRTY_DEFAULT  (OSC_MAX_RIF_DEFAULT * 4)
@@ -344,9 +399,9 @@ struct client_obd {
          int                      cl_conn_count;
          /* max_mds_easize is purely a performance thing so we don't have to
           * call obd_size_diskmd() all the time. */
-        int                      cl_default_mds_easize;
-        int                      cl_max_mds_easize;
-        int                      cl_max_mds_cookiesize;
+        unsigned                 cl_default_mds_easize;
+        unsigned                 cl_max_mds_easize;
+        unsigned                 cl_max_mds_cookiesize;
  
          //struct llog_canceld_ctxt *cl_llcd; /* it's included by obd_llog_ctxt */
          void                    *cl_llcd_offset;
@@ -378,6 +433,7 @@ struct client_obd {
           */
          client_obd_lock_t        cl_loi_list_lock;
          struct list_head         cl_loi_ready_list;
+        struct list_head         cl_loi_hp_ready_list;
          struct list_head         cl_loi_write_list;
          struct list_head         cl_loi_read_list;
          int                      cl_r_in_flight;
@@ -416,12 +472,16 @@ struct client_obd {
          __u32                    cl_supp_cksum_types;
          /* checksum algorithm to be used */
          cksum_type_t             cl_cksum_type;
- 
+
          /* also protected by the poorly named _loi_list_lock lock above */
          struct osc_async_rc      cl_ar;
  
          /* used by quotacheck */
          int                      cl_qchk_stat; /* quotacheck stat of the peer */
+
+        /* sequence manager */
+        struct lu_client_seq    *cl_seq;
+
          atomic_t                 cl_resends; /* resend count */
          /* Cache of triples */
          struct lustre_cache     *cl_cache;
@@ -452,15 +512,10 @@ struct mds_obd {
          cfs_dentry_t                    *mds_fid_de;
          int                              mds_max_mdsize;
          int                              mds_max_cookiesize;
-        struct file                     *mds_rcvd_filp;
-        spinlock_t                       mds_transno_lock;
-        __u64                            mds_last_transno;
-        __u64                            mds_mount_count;
          __u64                            mds_io_epoch;
          unsigned long                    mds_atime_diff;
          struct semaphore                 mds_epoch_sem;
          struct ll_fid                    mds_rootfid;
-        struct lr_server_data           *mds_server_data;
          cfs_dentry_t                    *mds_pending_dir;
          cfs_dentry_t                    *mds_logs_dir;
          cfs_dentry_t                    *mds_objects_dir;
@@ -479,11 +534,11 @@ struct mds_obd {
          /* file for store objid */
          struct file                     *mds_lov_objid_filp;
          __u32                            mds_lov_objid_count;
+        __u32                            mds_lov_objid_max_index;
          __u32                            mds_lov_objid_lastpage;
          __u32                            mds_lov_objid_lastidx;
  
          struct file                     *mds_health_check_filp;
-        unsigned long                   *mds_client_bitmap;
          struct upcall_cache             *mds_group_hash;
  
          struct lustre_quota_info         mds_quota_info;
@@ -493,13 +548,24 @@ struct mds_obd {
                                           mds_fl_acl:1,
                                           mds_fl_cfglog:1,
                                           mds_fl_synced:1,
+                                         mds_fl_target:1, /* mds have one or
+                                                           * more targets */
                                           mds_evict_ost_nids:1;
  
          uid_t                            mds_squash_uid;
          gid_t                            mds_squash_gid;
          lnet_nid_t                       mds_nosquash_nid;
+        /* do we need permission sync */
+        unsigned int                     mds_sync_permission;
  };
  
+#define mds_transno_lock         mds_obt.obt_translock
+#define mds_rcvd_filp            mds_obt.obt_rcvd_filp
+#define mds_server_data          mds_obt.obt_lsd
+#define mds_client_bitmap        mds_obt.obt_client_bitmap
+#define mds_mount_count          mds_obt.obt_mount_count
+#define mds_last_transno         mds_obt.obt_last_transno
+
  /* lov objid */
  #define mds_max_ost_index  (0xFFFF)
  #define MDS_LOV_ALLOC_SIZE (CFS_PAGE_SIZE)
@@ -544,10 +610,11 @@ struct echo_client_obd {
  struct lov_qos_oss {
          struct obd_uuid     lqo_uuid;       /* ptlrpc's c_remote_uuid */
          struct list_head    lqo_oss_list;   /* link to lov_qos */
-        __u32               lqo_ost_count;  /* number of osts on this oss */
          __u64               lqo_bavail;     /* total bytes avail on OSS */
          __u64               lqo_penalty;    /* current penalty */
          __u64               lqo_penalty_per_obj; /* penalty decrease every obj*/
+        time_t              lqo_used;       /* last used time, seconds */
+        __u32               lqo_ost_count;  /* number of osts on this oss */
  };
  
  struct ltd_qos {
@@ -555,18 +622,36 @@ struct ltd_qos {
          __u64               ltq_penalty;     /* current penalty */
          __u64               ltq_penalty_per_obj; /* penalty decrease every obj*/
          __u64               ltq_weight;      /* net weighting */
+        time_t              ltq_used;        /* last used time, seconds */
          unsigned int        ltq_usable:1;    /* usable for striping */
  };
  
+/* Generic subset of OSTs */
+struct ost_pool {
+        __u32              *op_array;        /* array of index of
+                                                lov_obd->lov_tgts */
+        unsigned int        op_count;        /* number of OSTs in the array */
+        unsigned int        op_size;         /* allocated size of lp_array */
+        struct rw_semaphore op_rw_sem;       /* to protect ost_pool use */
+};
+
+/* Round-robin allocator data */
+struct lov_qos_rr {
+        __u32               lqr_start_idx;   /* start index of new inode */
+        __u32               lqr_offset_idx;  /* aliasing for start_idx  */
+        int                 lqr_start_count; /* reseed counter */
+        struct ost_pool     lqr_pool;        /* round-robin optimized list */
+        unsigned long       lqr_dirty:1;     /* recalc round-robin list */
+};
+
+/* Stripe placement optimization */
  struct lov_qos {
          struct list_head    lq_oss_list;    /* list of OSSs that targets use */
          struct rw_semaphore lq_rw_sem;
          __u32               lq_active_oss_count;
-        __u32              *lq_rr_array;    /* round-robin optimized list */
-        unsigned int        lq_rr_size;     /* rr array size */
          unsigned int        lq_prio_free;   /* priority for free space */
+        struct lov_qos_rr   lq_rr;          /* round robin qos data */
          unsigned long       lq_dirty:1,     /* recalc qos data */
-                            lq_dirty_rr:1,  /* recalc round-robin list */
                              lq_same_space:1,/* the ost's all have approx.
                                                 the same space avail */
                              lq_reset:1;     /* zero current penalties */
@@ -583,9 +668,30 @@ struct lov_tgt_desc {
                              ltd_reap:1;  /* should this target be deleted */
  };
  
+/* Pool metadata */
+#define pool_tgt_size(_p)   _p->pool_obds.op_size
+#define pool_tgt_count(_p)  _p->pool_obds.op_count
+#define pool_tgt_array(_p)  _p->pool_obds.op_array
+#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem
+#define pool_tgt(_p, _i)    _p->pool_lov->lov_tgts[_p->pool_obds.op_array[_i]]
+
+struct pool_desc {
+        char                  pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */
+        struct ost_pool       pool_obds;              /* pool members */
+        atomic_t              pool_refcount;          /* pool ref. counter */
+        struct lov_qos_rr     pool_rr;                /* round robin qos */
+        struct hlist_node     pool_hash;              /* access by poolname */
+        struct list_head      pool_list;              /* serial access */
+        cfs_proc_dir_entry_t *pool_proc_entry;        /* file in /proc */
+        struct lov_obd       *pool_lov;               /* lov obd to which this
+                                                         pool belong */
+};
+
  struct lov_obd {
          struct lov_desc         desc;
-        struct lov_tgt_desc   **lov_tgts;
+        struct lov_tgt_desc   **lov_tgts;              /* sparse array */
+        struct ost_pool         lov_packed;            /* all OSTs in a packed
+                                                          array */
          struct semaphore        lov_lock;
          struct obd_connect_data lov_ocd;
          struct lov_qos          lov_qos;               /* qos info per lov */
@@ -594,13 +700,14 @@ struct lov_obd {
          __u32                   lov_active_tgt_count;  /* how many active */
          __u32                   lov_death_row;/* tgts scheduled to be deleted */
          __u32                   lov_tgt_size;   /* size of tgts array */
-        __u32                   lov_start_idx;  /* start index of new inode */
-        __u32                   lov_offset_idx; /* aliasing for start_idx  */
-        int                     lov_start_count;/* reseed counter */
          int                     lov_connects;
          obd_page_removal_cb_t   lov_page_removal_cb;
          obd_pin_extent_cb       lov_page_pin_cb;
          obd_lock_cancel_cb      lov_lock_cancel_cb;
+        int                     lov_pool_count;
+        lustre_hash_t          *lov_pools_hash_body; /* used for key access */
+        struct list_head        lov_pool_list; /* used for sequential access */
+        cfs_proc_dir_entry_t   *lov_pool_proc_entry;
  };
  
  struct niobuf_local {
@@ -652,8 +759,10 @@ struct obd_trans_info {
          int                      oti_numcookies;
  
          /* initial thread handling transaction */
-        int                      oti_thread_id;
+        struct ptlrpc_thread *   oti_thread;
          __u32                    oti_conn_cnt;
+        /* VBR: versions */
+        __u64                    oti_pre_version;
  
          struct obd_uuid         *oti_ost_uuid;
  };
@@ -669,10 +778,19 @@ static inline void oti_init(struct obd_trans_info *oti,
                  return;
  
          oti->oti_xid = req->rq_xid;
+        /* VBR: take versions from request */
+        if (req->rq_reqmsg != NULL &&
+            lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+                __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg);
+                /* b1.6 interoperability check. pre_versions may be NULL */
+                oti->oti_pre_version = pre_version ? pre_version[0] : 0;
+                oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg);
+        }
  
+        /* called from mds_create_objects */
          if (req->rq_repmsg != NULL)
                  oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
-        oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1;
+        oti->oti_thread = req->rq_svc_thread;
          if (req->rq_reqmsg != NULL)
                  oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
  }
@@ -741,8 +859,9 @@ enum obd_notify_event {
          OBD_NOTIFY_CONFIG
  };
  
-#define CONFIG_LOG  0x1  /* finished processing config log */
-#define CONFIG_SYNC 0x2  /* mdt synced 1 ost */
+#define CONFIG_LOG      0x1  /* finished processing config log */
+#define CONFIG_SYNC     0x2  /* mdt synced 1 ost */
+#define CONFIG_TARGET   0x4  /* one target is added */
  
  /*
   * Data structure used to pass obd_notify()-event to non-obd listeners (llite
@@ -758,6 +877,7 @@ struct obd_notify_upcall {
  /* corresponds to one of the obd's */
  #define MAX_OBD_NAME 128
  #define OBD_DEVICE_MAGIC        0XAB5CD6EF
+#define OBD_DEV_BY_DEVNAME      0xffffd0de
  struct obd_device {
          struct obd_type        *obd_type;
          __u32                   obd_magic;
@@ -770,7 +890,8 @@ struct obd_device {
          unsigned long obd_attached:1,      /* finished attach */
                        obd_set_up:1,        /* finished setup */
                        obd_recovering:1,    /* there are recoverable clients */
-                      obd_abort_recovery:1,/* somebody ioctl'ed us to abort */ 
+                      obd_abort_recovery:1,/* recovery expired */
+                      obd_version_recov:1, /* obd uses version checking */
                        obd_replayable:1,    /* recovery is enabled; inform clients */
                        obd_no_transno:1,    /* no committed-transno notification */
                        obd_no_recov:1,      /* fail instead of retry messages */
@@ -781,18 +902,18 @@ struct obd_device {
                        obd_async_recov:1,   /* allow asyncronous orphan cleanup */
                        obd_no_conn:1,       /* deny new connections */
                        obd_inactive:1;      /* device active/inactive
-                                           * (for /proc/status only!!) */
+                                            * (for /proc/status only!!) */
          /* uuid-export hash body */
-        struct lustre_class_hash_body *obd_uuid_hash_body;
+        struct lustre_hash     *obd_uuid_hash;
          /* nid-export hash body */
-        struct lustre_class_hash_body *obd_nid_hash_body;
+        struct lustre_hash     *obd_nid_hash;
          /* nid stats body */
-        struct lustre_class_hash_body *obd_nid_stats_hash_body;
+        struct lustre_hash     *obd_nid_stats_hash;
          struct list_head        obd_nid_stats;
          atomic_t                obd_refcount;
          cfs_waitq_t             obd_refcount_waitq;
-        cfs_waitq_t             obd_llog_waitq;
          struct list_head        obd_exports;
+        struct list_head        obd_delayed_exports;
          int                     obd_num_exports;
          spinlock_t              obd_nid_lock;
          struct ldlm_namespace  *obd_namespace;
@@ -804,9 +925,14 @@ struct obd_device {
          struct fsfilt_operations *obd_fsops;
          spinlock_t              obd_osfs_lock;
          struct obd_statfs       obd_osfs;       /* locked by obd_osfs_lock */
-        __u64                   obd_osfs_age;   
+        __u64                   obd_osfs_age;
          struct lvfs_run_ctxt    obd_lvfs_ctxt;
+
          struct llog_ctxt        *obd_llog_ctxt[LLOG_MAX_CTXTS];
+        struct semaphore        obd_llog_alloc;
+        struct semaphore        obd_llog_cat_process;
+        cfs_waitq_t             obd_llog_waitq;
+
          struct obd_device       *obd_observer;
          struct obd_notify_upcall obd_upcall;
          struct obd_export       *obd_self_export;
@@ -819,14 +945,13 @@ struct obd_device {
          int                              obd_max_recoverable_clients;
          int                              obd_connected_clients;
          int                              obd_recoverable_clients;
+        int                              obd_delayed_clients;
          spinlock_t                       obd_processing_task_lock; /* BH lock (timer) */
          pid_t                            obd_processing_task;
          __u64                            obd_next_recovery_transno;
          int                              obd_replayed_requests;
          int                              obd_requests_queued_for_recovery;
          cfs_waitq_t                      obd_next_transno_waitq;
-        struct list_head                 obd_uncommitted_replies;
-        spinlock_t                       obd_uncommitted_replies_lock;
          cfs_timer_t                      obd_recovery_timer;
          struct list_head                 obd_recovery_queue;
          struct list_head                 obd_delayed_reply_queue;
@@ -864,9 +989,6 @@ struct obd_device {
          __u64                  obd_pool_slv;
  };
  
-#define OBD_OPT_FORCE           0x0001
-#define OBD_OPT_FAILOVER        0x0002
-
  #define OBD_LLOG_FL_SENDNOW     0x0001
  
  enum obd_cleanup_stage {
@@ -889,11 +1011,11 @@ enum obd_cleanup_stage {
  #define KEY_LOVDESC             "lovdesc"
  #define KEY_INIT_RECOV          "initial_recov"
  #define KEY_INIT_RECOV_BACKUP   "init_recov_bk"
-#define KEY_LOV_IDX             "lov_idx"
  #define KEY_LAST_ID             "last_id"
  #define KEY_LOCK_TO_STRIPE      "lock_to_stripe"
  #define KEY_CHECKSUM            "checksum"
-#define KEY_READONLY            "readonly"
+#define KEY_READONLY            "read-only"
+#define KEY_READONLY_166COMPAT  "readonly"
  #define KEY_UNLINKED            "unlinked"
  #define KEY_EVICT_BY_NID        "evict_by_nid"
  #define KEY_REGISTER_TARGET     "register_target"
@@ -903,6 +1025,7 @@ enum obd_cleanup_stage {
  #define KEY_BLOCKSIZE           "blocksize"
  #define KEY_BLOCKSIZE_BITS      "blocksize_bits"
  #define KEY_MAX_EASIZE          "max_ea_size"
+#define KEY_FIEMAP              "FIEMAP"
  /* XXX unused */
  #define KEY_ASYNC               "async"
  
@@ -911,7 +1034,7 @@ struct obd_ops {
          int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
                             void *karg, void *uarg);
          int (*o_get_info)(struct obd_export *, __u32 keylen, void *key,
-                          __u32 *vallen, void *val);
+                          __u32 *vallen, void *val, struct lov_stripe_md *lsm);
          int (*o_set_info_async)(struct obd_export *, __u32 keylen, void *key,
                                  __u32 vallen, void *val,
                                  struct ptlrpc_request_set *set);
@@ -936,9 +1059,14 @@ struct obd_ops {
                           void *localdata);
          int (*o_reconnect)(struct obd_export *exp, struct obd_device *src,
                             struct obd_uuid *cluuid,
-                           struct obd_connect_data *ocd);
+                           struct obd_connect_data *ocd,
+                           void *localdata);
          int (*o_disconnect)(struct obd_export *exp);
  
+        /* Initialize/finalize fids infrastructure. */
+        int (*o_fid_init)(struct obd_export *exp);
+        int (*o_fid_fini)(struct obd_export *exp);
+
          int (*o_statfs)(struct obd_device *obd, struct obd_statfs *osfs,
                          __u64 max_age, __u32 flags);
          int (*o_statfs_async)(struct obd_device *obd, struct obd_info *oinfo,
@@ -975,7 +1103,7 @@ struct obd_ops {
          int (*o_prep_async_page)(struct obd_export *exp,
                                   struct lov_stripe_md *lsm,
                                   struct lov_oinfo *loi,
-                                 cfs_page_t *page, obd_off offset, 
+                                 cfs_page_t *page, obd_off offset,
                                   struct obd_async_page_ops *ops, void *data,
                                   void **res, int nocache,
                                   struct lustre_handle *lockh);
@@ -1011,13 +1139,16 @@ struct obd_ops {
                                       struct lov_oinfo *loi, void *cookie);
          int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
                             struct ost_lvb *lvb, int kms_only);
+        int (*o_update_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
+                            struct ost_lvb *lvb, obd_flag valid);
          int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
                              obd_off size, int shrink);
          int (*o_punch)(struct obd_export *exp, struct obd_info *oinfo,
                         struct obd_trans_info *oti,
                         struct ptlrpc_request_set *rqset);
-        int (*o_sync)(struct obd_export *exp, struct obdo *oa,
-                      struct lov_stripe_md *ea, obd_size start, obd_size end);
+        int (*o_sync)(struct obd_export *exp, struct obd_info *oinfo,
+                      obd_size start, obd_size end,
+                      struct ptlrpc_request_set *rqset);
          int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst,
                           struct lov_stripe_md *src, obd_size start,
                           obd_size end, struct obd_trans_info *oti);
@@ -1029,11 +1160,13 @@ struct obd_ops {
                           obd_id *startid, obd_gr group, void *data);
          int (*o_preprw)(int cmd, struct obd_export *exp, struct obdo *oa,
                          int objcount, struct obd_ioobj *obj,
-                        int niocount, struct niobuf_remote *remote,
-                        struct niobuf_local *local, struct obd_trans_info *oti);
+                        struct niobuf_remote *remote, int *nr_pages,
+                        struct niobuf_local *local,
+                        struct obd_trans_info *oti);
          int (*o_commitrw)(int cmd, struct obd_export *exp, struct obdo *oa,
                            int objcount, struct obd_ioobj *obj,
-                          int niocount, struct niobuf_local *local,
+                          struct niobuf_remote *remote, int pages,
+                          struct niobuf_local *local,
                            struct obd_trans_info *oti, int rc);
          int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
                           struct ldlm_enqueue_info *einfo,
@@ -1056,12 +1189,12 @@ struct obd_ops {
  
          /* llog related obd_methods */
          int (*o_llog_init)(struct obd_device *obd, struct obd_device *disk_obd,
-                           int count, struct llog_catid *logid, 
+                           int count, struct llog_catid *logid,
                             struct obd_uuid *uuid);
          int (*o_llog_finish)(struct obd_device *obd, int count);
  
          /* metadata-only methods */
-        int (*o_pin)(struct obd_export *, obd_id ino, __u32 gen, int type,
+        int (*o_pin)(struct obd_export *, struct ll_fid *,
                       struct obd_client_handle *, int flag);
          int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int);
  
@@ -1077,7 +1210,8 @@ struct obd_ops {
          int (*o_quotacheck)(struct obd_export *, struct obd_quotactl *);
          int (*o_quotactl)(struct obd_export *, struct obd_quotactl *);
          int (*o_quota_adjust_qunit)(struct obd_export *exp,
-                                    struct quota_adjust_qunit *oqaq);
+                                    struct quota_adjust_qunit *oqaq,
+                                    struct lustre_quota_ctxt *qctxt);
  
  
          int (*o_ping)(struct obd_export *exp);
@@ -1091,7 +1225,13 @@ struct obd_ops {
                                         obd_lock_cancel_cb cb);
          int (*o_unregister_lock_cancel_cb)(struct obd_export *exp,
                                           obd_lock_cancel_cb cb);
-        
+        /* pools methods */
+        int (*o_pool_new)(struct obd_device *obd, char *poolname);
+        int (*o_pool_del)(struct obd_device *obd, char *poolname);
+        int (*o_pool_add)(struct obd_device *obd, char *poolname,
+                          char *ostname);
+        int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+                          char *ostname);
          /*
           * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
           * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
@@ -1103,9 +1243,9 @@ struct lsm_operations {
          int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
                             struct obd_export *md_exp);
          void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
-                                     unsigned long *);
+                                    obd_off *);
          void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
-                                     unsigned long *);
+                                     obd_off *);
          obd_off (*lsm_stripe_offset_by_index)(struct lov_stripe_md *, int);
          obd_off (*lsm_stripe_offset_by_offset)(struct lov_stripe_md *, obd_off);
          int (*lsm_stripe_index_by_offset)(struct lov_stripe_md *, obd_off);
@@ -1116,15 +1256,18 @@ struct lsm_operations {
                               struct lov_mds_md *lmm);
  };
  
-extern struct lsm_operations lsm_plain_ops;
+extern struct lsm_operations lsm_v1_ops;
  extern struct lsm_operations lsm_join_ops;
+extern struct lsm_operations lsm_v3_ops;
  static inline struct lsm_operations *lsm_op_find(int magic)
  {
          switch(magic) {
-        case LOV_MAGIC:
-               return &lsm_plain_ops;
+        case LOV_MAGIC_V1:
+               return &lsm_v1_ops;
          case LOV_MAGIC_JOIN:
                 return &lsm_join_ops;
+        case LOV_MAGIC_V3:
+               return &lsm_v3_ops;
          default:
                 CERROR("Cannot recognize lsm_magic %x\n", magic);
                 return NULL;
@@ -1138,19 +1281,24 @@ int lvfs_check_io_health(struct obd_device *obd, struct file *file);
  #define OBD_CALC_STRIPE_END     2
  
  static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno,
-                                         int error)
+                                         struct obd_export *exp, int error)
  {
          if (error) {
                  CERROR("%s: transno "LPU64" commit error: %d\n",
                         obd->obd_name, transno, error);
                  return;
          }
-        CDEBUG(D_HA, "%s: transno "LPU64" committed\n",
-               obd->obd_name, transno);
-        if (transno > obd->obd_last_committed) {
-                obd->obd_last_committed = transno;
-                ptlrpc_commit_replies (obd);
+        if (exp && transno > exp->exp_last_committed) {
+                CDEBUG(D_HA, "%s: transno "LPU64" committed\n",
+                       obd->obd_name, transno);
+                exp->exp_last_committed = transno;
+                ptlrpc_commit_replies(exp);
+        } else {
+                CDEBUG(D_INFO, "%s: transno "LPU64" committed\n",
+                       obd->obd_name, transno);
          }
+        if (transno > obd->obd_last_committed)
+                obd->obd_last_committed = transno;
  }
  
  static inline void init_obd_quota_ops(quota_interface_t *interface,
diff --git a/lustre/include/obd_cache.h b/lustre/include/obd_cache.h

index c5ec326..f067950 100644 (file)
--- a/lustre/include/obd_cache.h
+++ b/lustre/include/obd_cache.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _OBD_CACHE_H__
diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h

index ec94471..c07578d 100644 (file)
--- a/lustre/include/obd_class.h
+++ b/lustre/include/obd_class.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __CLASS_OBD_H
@@ -44,6 +58,7 @@
  /* OBD Device Declarations */
  extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
  extern spinlock_t obd_dev_lock;
+extern cfs_mem_cache_t *obd_lvfs_ctxt_cache;
  
  /* OBD Operations Declarations */
  extern struct obd_device *class_conn2obd(struct lustre_handle *);
@@ -90,7 +105,7 @@ void obd_zombie_impexp_cull(void);
  
  /* obd_config.c */
  int class_process_config(struct lustre_cfg *lcfg);
-int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, 
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
                               struct lustre_cfg *lcfg, void *data);
  int class_attach(struct lustre_cfg *lcfg);
  int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
@@ -121,7 +136,7 @@ struct config_llog_instance {
          struct super_block *cfg_sb;
          struct obd_uuid     cfg_uuid;
          int                 cfg_last_idx; /* for partial llog processing */
-        int                 cfg_flags; 
+        int                 cfg_flags;
  };
  int class_config_parse_llog(struct llog_ctxt *ctxt, char *name,
                              struct config_llog_instance *cfg);
@@ -205,9 +220,22 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd,
  int class_disconnect(struct obd_export *exp);
  void class_fail_export(struct obd_export *exp);
  void class_disconnect_exports(struct obd_device *obddev);
-void class_disconnect_stale_exports(struct obd_device *obddev);
+void class_set_export_delayed(struct obd_export *exp);
+void class_handle_stale_exports(struct obd_device *obddev);
+void class_disconnect_expired_exports(struct obd_device *obd);
+void class_disconnect_stale_exports(struct obd_device *obddev,
+                                    enum obd_option flags);
+int class_stale_export_list(struct obd_device *obd, struct obd_ioctl_data *data);
  int class_manual_cleanup(struct obd_device *obd);
  
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+        return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
+                (obd->obd_force ? OBD_OPT_FORCE : 0) |
+                (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) |
+                0);
+}
+
  /* obdo.c */
  void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
  void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
@@ -316,7 +344,8 @@ static inline int class_devno_max(void)
  }
  
  static inline int obd_get_info(struct obd_export *exp, __u32 keylen,
-                               void *key, __u32 *vallen, void *val)
+                               void *key, __u32 *vallen, void *val,
+                               struct lov_stripe_md *lsm)
  {
          int rc;
          ENTRY;
@@ -324,7 +353,7 @@ static inline int obd_get_info(struct obd_export *exp, __u32 keylen,
          EXP_CHECK_OP(exp, get_info);
          EXP_COUNTER_INCREMENT(exp, get_info);
  
-        rc = OBP(exp->exp_obd, get_info)(exp, keylen, key, vallen, val);
+        rc = OBP(exp->exp_obd, get_info)(exp, keylen, key, vallen, val, lsm);
          RETURN(rc);
  }
  
@@ -338,7 +367,7 @@ static inline int obd_set_info_async(struct obd_export *exp, obd_count keylen,
          EXP_CHECK_OP(exp, set_info_async);
          EXP_COUNTER_INCREMENT(exp, set_info_async);
  
-        rc = OBP(exp->exp_obd, set_info_async)(exp, keylen, key, vallen, val, 
+        rc = OBP(exp->exp_obd, set_info_async)(exp, keylen, key, vallen, val,
                                                 set);
          RETURN(rc);
  }
@@ -355,7 +384,7 @@ static inline int obd_setup(struct obd_device *obd, int datalen, void *data)
          RETURN(rc);
  }
  
-static inline int obd_precleanup(struct obd_device *obd, 
+static inline int obd_precleanup(struct obd_device *obd,
                                   enum obd_cleanup_stage cleanup_stage)
  {
          int rc;
@@ -663,7 +692,8 @@ static inline int obd_connect(struct lustre_handle *conn,struct obd_device *obd,
  static inline int obd_reconnect(struct obd_export *exp,
                                  struct obd_device *obd,
                                  struct obd_uuid *cluuid,
-                                struct obd_connect_data *d)
+                                struct obd_connect_data *d,
+                                void *localdata)
  {
          int rc;
          __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition check */
@@ -673,7 +703,7 @@ static inline int obd_reconnect(struct obd_export *exp,
          OBD_CHECK_OP(obd, reconnect, 0);
          OBD_COUNTER_INCREMENT(obd, reconnect);
  
-        rc = OBP(obd, reconnect)(exp, obd, cluuid, d);
+        rc = OBP(obd, reconnect)(exp, obd, cluuid, d, localdata);
          /* check that only subset is granted */
          LASSERT(ergo(d != NULL,
                       (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
@@ -692,6 +722,30 @@ static inline int obd_disconnect(struct obd_export *exp)
          RETURN(rc);
  }
  
+static inline int obd_fid_init(struct obd_export *exp)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(exp->exp_obd, fid_init, 0);
+        EXP_COUNTER_INCREMENT(exp, fid_init);
+
+        rc = OBP(exp->exp_obd, fid_init)(exp);
+        RETURN(rc);
+}
+
+static inline int obd_fid_fini(struct obd_export *exp)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(exp->exp_obd, fid_fini, 0);
+        EXP_COUNTER_INCREMENT(exp, fid_fini);
+
+        rc = OBP(exp->exp_obd, fid_fini)(exp);
+        RETURN(rc);
+}
+
  static inline int obd_ping(struct obd_export *exp)
  {
          int rc;
@@ -704,6 +758,54 @@ static inline int obd_ping(struct obd_export *exp)
          RETURN(rc);
  }
  
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(obd, pool_new, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_new);
+
+        rc = OBP(obd, pool_new)(obd, poolname);
+        RETURN(rc);
+}
+
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(obd, pool_del, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_del);
+
+        rc = OBP(obd, pool_del)(obd, poolname);
+        RETURN(rc);
+}
+
+static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(obd, pool_add, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_add);
+
+        rc = OBP(obd, pool_add)(obd, poolname, ostname);
+        RETURN(rc);
+}
+
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(obd, pool_rem, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+        rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+        RETURN(rc);
+}
+
  static inline int obd_init_export(struct obd_export *exp)
  {
          int rc = 0;
@@ -778,6 +880,7 @@ static inline int obd_statfs_async(struct obd_device *obd,
                  spin_lock(&obd->obd_osfs_lock);
                  memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
                  spin_unlock(&obd->obd_osfs_lock);
+                oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
                  if (oinfo->oi_cb_up)
                          oinfo->oi_cb_up(oinfo, 0);
          }
@@ -844,9 +947,30 @@ static inline int obd_statfs(struct obd_device *obd, struct obd_statfs *osfs,
          RETURN(rc);
  }
  
-static inline int obd_sync(struct obd_export *exp, struct obdo *oa,
-                           struct lov_stripe_md *ea, obd_size start,
-                           obd_size end)
+static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo,
+                                 obd_size start, obd_size end)
+{
+        struct ptlrpc_request_set *set = NULL;
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+        EXP_COUNTER_INCREMENT(exp, sync);
+
+        set =  ptlrpc_prep_set();
+        if (set == NULL)
+                RETURN(-ENOMEM);
+
+        rc = OBP(exp->exp_obd, sync)(exp, oinfo, start, end, set);
+        if (rc == 0)
+                rc = ptlrpc_set_wait(set);
+        ptlrpc_set_destroy(set);
+        RETURN(rc);
+}
+
+static inline int obd_sync(struct obd_export *exp, struct obd_info *oinfo,
+                           obd_size start, obd_size end,
+                           struct ptlrpc_request_set *set)
  {
          int rc;
          ENTRY;
@@ -854,7 +978,7 @@ static inline int obd_sync(struct obd_export *exp, struct obdo *oa,
          OBD_CHECK_OP(exp->exp_obd, sync, -EOPNOTSUPP);
          EXP_COUNTER_INCREMENT(exp, sync);
  
-        rc = OBP(exp->exp_obd, sync)(exp, oa, ea, start, end);
+        rc = OBP(exp->exp_obd, sync)(exp, oinfo, start, end, set);
          RETURN(rc);
  }
  
@@ -1097,7 +1221,7 @@ static inline int obd_teardown_async_page(struct obd_export *exp,
  
  static inline int obd_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
                               int objcount, struct obd_ioobj *obj,
-                             int niocount, struct niobuf_remote *remote,
+                             struct niobuf_remote *remote, int *pages,
                               struct niobuf_local *local,
                               struct obd_trans_info *oti)
  {
@@ -1107,14 +1231,15 @@ static inline int obd_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
          OBD_CHECK_OP(exp->exp_obd, preprw, -EOPNOTSUPP);
          EXP_COUNTER_INCREMENT(exp, preprw);
  
-        rc = OBP(exp->exp_obd, preprw)(cmd, exp, oa, objcount, obj, niocount,
-                                       remote, local, oti);
+        rc = OBP(exp->exp_obd, preprw)(cmd, exp, oa, objcount, obj, remote,
+                                       pages, local, oti);
          RETURN(rc);
  }
  
  static inline int obd_commitrw(int cmd, struct obd_export *exp, struct obdo *oa,
                                 int objcount, struct obd_ioobj *obj,
-                               int niocount, struct niobuf_local *local,
+                               struct niobuf_remote *rnb, int pages,
+                               struct niobuf_local *local,
                                 struct obd_trans_info *oti, int rc)
  {
          ENTRY;
@@ -1122,8 +1247,8 @@ static inline int obd_commitrw(int cmd, struct obd_export *exp, struct obdo *oa,
          OBD_CHECK_OP(exp->exp_obd, commitrw, -EOPNOTSUPP);
          EXP_COUNTER_INCREMENT(exp, commitrw);
  
-        rc = OBP(exp->exp_obd, commitrw)(cmd, exp, oa, objcount, obj, niocount,
-                                         local, oti, rc);
+        rc = OBP(exp->exp_obd, commitrw)(cmd, exp, oa, objcount, obj,
+                                         rnb, pages, local, oti, rc);
          RETURN(rc);
  }
  
@@ -1141,6 +1266,20 @@ static inline int obd_merge_lvb(struct obd_export *exp,
          RETURN(rc);
  }
  
+static inline int obd_update_lvb(struct obd_export *exp,
+                                 struct lov_stripe_md *lsm,
+                                 struct ost_lvb *lvb, obd_flag valid)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_OP(exp->exp_obd, update_lvb, -EOPNOTSUPP);
+        EXP_COUNTER_INCREMENT(exp, update_lvb);
+
+        rc = OBP(exp->exp_obd, update_lvb)(exp, lsm, lvb, valid);
+        RETURN(rc);
+}
+
  static inline int obd_adjust_kms(struct obd_export *exp,
                                   struct lov_stripe_md *lsm, obd_off size,
                                   int shrink)
@@ -1275,8 +1414,8 @@ static inline int obd_join_lru(struct obd_export *exp,
          RETURN(rc);
  }
  
-static inline int obd_pin(struct obd_export *exp, obd_id ino, __u32 gen,
-                          int type, struct obd_client_handle *handle, int flag)
+static inline int obd_pin(struct obd_export *exp, struct ll_fid *fid,
+                          struct obd_client_handle *handle, int flag)
  {
          int rc;
          ENTRY;
@@ -1284,7 +1423,7 @@ static inline int obd_pin(struct obd_export *exp, obd_id ino, __u32 gen,
          EXP_CHECK_OP(exp, pin);
          EXP_COUNTER_INCREMENT(exp, pin);
  
-        rc = OBP(exp->exp_obd, pin)(exp, ino, gen, type, handle, flag);
+        rc = OBP(exp->exp_obd, pin)(exp, fid, handle, flag);
          RETURN(rc);
  }
  
@@ -1329,13 +1468,13 @@ static inline int obd_notify(struct obd_device *obd,
          /* the check for async_recov is a complete hack - I'm hereby
             overloading the meaning to also mean "this was called from
             mds_postsetup".  I know that my mds is able to handle notifies
-           by this point, and it needs to get them to execute mds_postrecov. */                                                                                
+           by this point, and it needs to get them to execute mds_postrecov. */
          if (!obd->obd_set_up && !obd->obd_async_recov) {
                  CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
                  RETURN(-EINVAL);
          }
  
-        if (!OBP(obd, notify)) 
+        if (!OBP(obd, notify))
                  RETURN(-ENOSYS);
  
          OBD_COUNTER_INCREMENT(obd, notify);
@@ -1394,15 +1533,34 @@ static inline int obd_quotactl(struct obd_export *exp,
  }
  
  static inline int obd_quota_adjust_qunit(struct obd_export *exp,
-                                         struct quota_adjust_qunit *oqaq)
+                                         struct quota_adjust_qunit *oqaq,
+                                         struct lustre_quota_ctxt *qctxt)
  {
+#if defined(LPROCFS) && defined(HAVE_QUOTA_SUPPORT)
+        struct timeval work_start;
+        struct timeval work_end;
+        long timediff;
+#endif
          int rc;
          ENTRY;
  
+#if defined(LPROCFS) && defined(HAVE_QUOTA_SUPPORT)
+        if (qctxt)
+                do_gettimeofday(&work_start);
+#endif
          EXP_CHECK_OP(exp, quota_adjust_qunit);
          EXP_COUNTER_INCREMENT(exp, quota_adjust_qunit);
  
-        rc = OBP(exp->exp_obd, quota_adjust_qunit)(exp, oqaq);
+        rc = OBP(exp->exp_obd, quota_adjust_qunit)(exp, oqaq, qctxt);
+
+#if defined(LPROCFS) && defined(HAVE_QUOTA_SUPPORT)
+        if (qctxt) {
+                do_gettimeofday(&work_end);
+                timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+                lprocfs_counter_add(qctxt->lqc_stats, LQUOTA_ADJUST_QUNIT,
+                                    timediff);
+        }
+#endif
          RETURN(rc);
  }
  
diff --git a/lustre/include/obd_echo.h b/lustre/include/obd_echo.h

index 53b0e6b..7465b68 100644 (file)
--- a/lustre/include/obd_echo.h
+++ b/lustre/include/obd_echo.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _OBD_ECHO_H
diff --git a/lustre/include/obd_lov.h b/lustre/include/obd_lov.h

index c0f302d..da3ca51 100644 (file)
--- a/lustre/include/obd_lov.h
+++ b/lustre/include/obd_lov.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _OBD_LOV_H__
@@ -10,13 +42,17 @@ static inline int lov_stripe_md_size(int stripes)
          return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*);
  }
  
-#define lov_mds_md_size(stripes) lov_mds_md_v1_size(stripes)
-static inline int lov_mds_md_v1_size(int stripes)
+static inline int lov_mds_md_size(int stripes, int lmm_magic)
  {
-        return sizeof(struct lov_mds_md_v1) +
-                stripes * sizeof(struct lov_ost_data_v1);
+        if (lmm_magic == LOV_MAGIC_V3)
+                return sizeof(struct lov_mds_md_v3) +
+                        stripes * sizeof(struct lov_ost_data_v1);
+        else
+                return sizeof(struct lov_mds_md_v1) +
+                        stripes * sizeof(struct lov_ost_data_v1);
  }
  
+
  #define IOC_LOV_TYPE                   'g'
  #define IOC_LOV_MIN_NR                 50
  #define IOC_LOV_SET_OSC_ACTIVE         _IOWR('g', 50, long)
diff --git a/lustre/include/obd_ost.h b/lustre/include/obd_ost.h

index d2b399e..b2df596 100644 (file)
--- a/lustre/include/obd_ost.h
+++ b/lustre/include/obd_ost.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/obd_ost.h
   *
   * Data structures for object storage targets and client: OST & OSC's
   * 
@@ -38,4 +70,24 @@ int osc_extent_blocking_cb(struct ldlm_lock *lock,
                             struct ldlm_lock_desc *new, void *data,
                             int flag);
  
+/** 
+ * Build DLM resource name from object id & group for osc-ost extent lock.
+ */
+static inline struct ldlm_res_id *osc_build_res_name(__u64 id, __u64 gr,
+                                                     struct ldlm_res_id *name)
+{
+        memset(name, 0, sizeof *name);
+        name->name[0] = id;
+        name->name[1] = gr;
+        return name;
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
+ */
+static inline int osc_res_name_eq(__u64 id, __u64 gr, struct ldlm_res_id *name)
+{
+        return name->name[0] == id && name->name[1] == gr;
+}
+
  #endif
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index 34cf827..0417fc1 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _OBD_SUPPORT
@@ -41,7 +55,7 @@ extern unsigned int obd_fail_val;
  extern unsigned int obd_debug_peer_on_timeout;
  extern unsigned int obd_dump_on_timeout;
  extern unsigned int obd_dump_on_eviction;
-/* obd_timeout should only be used for recovery, not for 
+/* obd_timeout should only be used for recovery, not for
     networking / disk / timings affected by load (use Adaptive Timeouts) */
  extern unsigned int obd_timeout;          /* seconds */
  extern unsigned int ldlm_timeout;         /* seconds */
@@ -53,8 +67,14 @@ extern int obd_race_state;
  extern unsigned int obd_alloc_fail_rate;
  
  /* Timeout definitions */
-#define OBD_TIMEOUT_DEFAULT 100
-#define LDLM_TIMEOUT_DEFAULT 20
+#define OBD_TIMEOUT_DEFAULT             100
+#define LDLM_TIMEOUT_DEFAULT            20
+#define MDS_LDLM_TIMEOUT_DEFAULT        6
+#ifdef HAVE_DELAYED_RECOVERY
+#define STALE_EXPORT_MAXTIME_DEFAULT    (24*60*60) /**< one day, in seconds */
+#else
+#define STALE_EXPORT_MAXTIME_DEFAULT    (0) /**< zero if no delayed recovery */
+#endif
  #ifdef CRAY_XT3
   #define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
  #endif
@@ -77,12 +97,12 @@ extern unsigned int obd_alloc_fail_rate;
  #define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout))
  #define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
  #ifndef CRAY_XT3
-/* In general this should be low to have quick detection of a system 
+/* In general this should be low to have quick detection of a system
     running on a backup server. (If it's too low, import_select_connection
     will increase the timeout anyhow.)  */
  #define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20)
  #else
-/* ...but for very large systems (e.g. CRAY) we need to keep the initial 
+/* ...but for very large systems (e.g. CRAY) we need to keep the initial
     connect t.o. high (bz 10803), because they will nearly ALWAYS be doing the
     connects for the first time (clients "reboot" after every process, so no
     chance to generate adaptive timeout data. */
@@ -153,6 +173,8 @@ extern unsigned int obd_alloc_fail_rate;
  #define OBD_FAIL_MDS_CLOSE_NET_REP       0x13b
  #define OBD_FAIL_MDS_BLOCK_QUOTA_REQ     0x13c
  #define OBD_FAIL_MDS_DROP_QUOTA_REQ      0x13d
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA    0x13e
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING   0x13f
  
  #define OBD_FAIL_OST                     0x200
  #define OBD_FAIL_OST_CONNECT_NET         0x201
@@ -192,6 +214,7 @@ extern unsigned int obd_alloc_fail_rate;
  #define OBD_FAIL_OST_PAUSE_CREATE        0x223
  #define OBD_FAIL_OST_BRW_PAUSE_PACK      0x224
  #define OBD_FAIL_OST_CONNECT_NET2        0x225
+#define OBD_FAIL_OST_NOMEM               0x226
  
  #define OBD_FAIL_LDLM                    0x300
  #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
@@ -214,6 +237,10 @@ extern unsigned int obd_alloc_fail_rate;
  #define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
  #define OBD_FAIL_LDLM_CLOSE_THREAD       0x313
  #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE  0x314
+#define OBD_FAIL_LDLM_CP_CB_WAIT         0x315
+#define OBD_FAIL_LDLM_OST_FAIL_RACE      0x316
+#define OBD_FAIL_LDLM_INTR_CP_AST        0x317
+#define OBD_FAIL_LDLM_CP_BL_RACE         0x318
  
  #define OBD_FAIL_OSC                     0x400
  #define OBD_FAIL_OSC_BRW_READ_BULK       0x401
@@ -241,12 +268,21 @@ extern unsigned int obd_alloc_fail_rate;
  #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
  #define OBD_FAIL_PTLRPC_PAUSE_REQ        0x50a
  #define OBD_FAIL_PTLRPC_PAUSE_REP        0x50c
+#define OBD_FAIL_PTLRPC_IMP_DEACTIVE     0x50d
+
+#define OBD_FAIL_PTLRPC_DUMP_LOG         0x50e
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT    0x511
+#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
  
  #define OBD_FAIL_OBD_PING_NET            0x600
  #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
  #define OBD_FAIL_OBD_LOGD_NET            0x602
  #define OBD_FAIL_OBD_QC_CALLBACK_NET     0x603
  #define OBD_FAIL_OBD_DQACQ               0x604
+#define OBD_FAIL_OBD_LLOG_SETUP          0x605
+#define OBD_FAIL_OBD_LOG_CANCEL_REP      0x606
  
  #define OBD_FAIL_TGT_REPLY_NET           0x700
  #define OBD_FAIL_TGT_CONN_RACE           0x701
@@ -256,6 +292,9 @@ extern unsigned int obd_alloc_fail_rate;
  #define OBD_FAIL_TGT_DELAY_PRECREATE     0x705
  #define OBD_FAIL_TGT_TOOMANY_THREADS     0x706
  #define OBD_FAIL_TGT_REPLAY_DROP         0x707
+#define OBD_FAIL_TGT_FAKE_EXP            0x708
+#define OBD_FAIL_TGT_REPLAY_DELAY        0x709
+#define OBD_FAIL_TGT_LAST_REPLAY         0x710
  
  #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
  #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
@@ -382,7 +421,7 @@ extern atomic_t libcfs_kmemory;
  #define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1)
  #define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100)
  
-#ifdef LPROCFS 
+#ifdef LPROCFS
  #define obd_memory_add(size)                                                  \
          lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size))
  #define obd_memory_sub(size)                                                  \
@@ -424,7 +463,7 @@ static inline void obd_memory_sub(long size)
          obd_alloc -= size;
  }
  
-static inline void obd_pages_add(int order) 
+static inline void obd_pages_add(int order)
  {
          obd_pages += 1<< order;
          if (obd_pages > obd_max_pages)
@@ -584,7 +623,7 @@ do {                                                                          \
          cfs_mem_cache_free((slab), (ptr));                                    \
          (ptr) = NULL;                                                         \
          0;                                                                    \
-}) 
+})
  #define OBD_SLAB_ALLOC(ptr, slab, type, size)                                 \
  do {                                                                          \
          LASSERT(!in_interrupt());                                             \
diff --git a/lustre/kernel_patches/LICENSE.cray b/lustre/kernel_patches/LICENSE.cray

new file mode 100644 (file)

index 0000000..2a767f3
--- /dev/null
+++ b/lustre/kernel_patches/LICENSE.cray
@@ -0,0 +1,371 @@
+All files in this subtree are licensed under the terms and conditions
+of the GNU General Public License version 2.
+
+Reproduced below is the GPL v2, and Linus's clarifying statement from
+the Linux kernel source code:
+
+----------------------------------------
+
+   NOTE! This copyright does *not* cover user programs that use kernel
+ services by normal system calls - this is merely considered normal use
+ of the kernel, and does *not* fall under the heading of "derived work".
+ Also note that the GPL below is copyrighted by the Free Software
+ Foundation, but the instance of code that it refers to (the Linux
+ kernel) is copyrighted by me and others who actually wrote it.
+
+                       Linus Torvalds
+
+----------------------------------------
+
+                   GNU GENERAL PUBLIC LICENSE
+                      Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                           Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+\f
+                   GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+\f
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+\f
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+\f
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                           NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                    END OF TERMS AND CONDITIONS
+\f
+           How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) 19yy  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) 19yy name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
+*** Use this verbiage for CRAY ***
+
+You may have signed or agreed to another license before downloading
+this software.  If so, you are bound by the terms and conditions
+of that agreement, and the following does not apply to you.  See the
+LICENSE file included with this distribution for more information.
+
+If you did not agree to a different license, then this copy of Lustre
+is open source software; you can redistribute it and/or modify it
+under the terms of version 2 of the GNU General Public License as
+published by the Free Software Foundation.
diff --git a/lustre/kernel_patches/README b/lustre/kernel_patches/README

index 7899f24..f340042 100644 (file)
--- a/lustre/kernel_patches/README
+++ b/lustre/kernel_patches/README
@@ -1,3 +1,3 @@
  The Linux kernel patches for Lustre.
-See https://mail.clusterfs.com/wikis/lustre/LustreHowto for information on
+See http://manual.lustre.org/index.php?title=Main_Page for information on
  how to patch your kernel.
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config

index 5f7cef8..97d697e 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config
@@ -1,7 +1,7 @@
  #
  # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16.46
-# Tue Jul  3 17:56:03 2007
+# Linux kernel version: 2.6.16.60
+# Wed May 21 20:30:49 2008
  #
  CONFIG_X86_32=y
  CONFIG_SEMAPHORE_SLEEPERS=y
@@ -26,15 +26,15 @@ CONFIG_LOCALVERSION=""
  CONFIG_LOCALVERSION_AUTO=y
  CONFIG_SUSE_KERNEL=y
  CONFIG_SLE_VERSION=10
-CONFIG_SLE_SP=1
+CONFIG_SLE_SP=2
  CONFIG_SLE_SP_SUBLEVEL=0
  CONFIG_SWAP=y
  CONFIG_SYSVIPC=y
  CONFIG_POSIX_MQUEUE=y
  CONFIG_BSD_PROCESS_ACCT=y
  CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASK_XACCT=y
  CONFIG_SYSCTL=y
  CONFIG_AUDIT=y
@@ -112,6 +112,7 @@ CONFIG_DEFAULT_IOSCHED="cfq"
  # CONFIG_X86_VISWS is not set
  CONFIG_X86_GENERICARCH=y
  # CONFIG_X86_ES7000 is not set
+# CONFIG_X86_VMI is not set
  CONFIG_X86_CYCLONE_TIMER=y
  # CONFIG_M386 is not set
  # CONFIG_M486 is not set
@@ -180,6 +181,7 @@ CONFIG_X86_CPUID=m
  CONFIG_EDD=m
  CONFIG_DELL_RBU=m
  CONFIG_DCDBAS=m
+CONFIG_DMIID=y
  # CONFIG_NOHIGHMEM is not set
  # CONFIG_HIGHMEM4G is not set
  CONFIG_HIGHMEM64G=y
@@ -236,6 +238,7 @@ CONFIG_ACPI_BUTTON=m
  CONFIG_ACPI_VIDEO=m
  # CONFIG_ACPI_HOTKEY is not set
  CONFIG_ACPI_FAN=m
+CONFIG_ACPI_DOCK=m
  CONFIG_ACPI_PROCESSOR=m
  CONFIG_ACPI_HOTPLUG_CPU=y
  CONFIG_ACPI_THERMAL=m
@@ -323,6 +326,7 @@ CONFIG_PCI_MMCONFIG=y
  CONFIG_PCIEPORTBUS=y
  CONFIG_HOTPLUG_PCI_PCIE=m
  # CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set
+CONFIG_PCIEAER=y
  CONFIG_PCI_MSI=y
  # CONFIG_PCI_LEGACY_PROC is not set
  # CONFIG_PCI_DEBUG is not set
@@ -1201,6 +1205,7 @@ CONFIG_SCSI_FC_ATTRS=m
  # CONFIG_SCSI_ISCSI_ATTRS is not set
  CONFIG_SCSI_SAS_ATTRS=m
  CONFIG_SCSI_SAS_LIBSAS=m
+# CONFIG_SCSI_SAS_ATA is not set
  # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
  CONFIG_ISCSI_TARGET=m
  
@@ -1397,11 +1402,8 @@ CONFIG_DM_MULTIPATH=m
  CONFIG_DM_MULTIPATH_EMC=m
  CONFIG_DM_MULTIPATH_HP_SW=m
  CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_ALUA=m
  CONFIG_DM_NL_EVT=y
-
-#
-# Fusion MPT device support
-#
  CONFIG_FUSION=y
  CONFIG_FUSION_SPI=m
  CONFIG_FUSION_FC=m
@@ -1410,6 +1412,7 @@ CONFIG_FUSION_MAX_SGE=128
  CONFIG_FUSION_MAX_FC_SGE=256
  CONFIG_FUSION_CTL=m
  CONFIG_FUSION_LAN=m
+# CONFIG_FUSION_LOGGING is not set
  
  #
  # IEEE 1394 (FireWire) support
@@ -1595,6 +1598,7 @@ CONFIG_DL2K=m
  CONFIG_E1000=m
  CONFIG_E1000_NAPI=y
  # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_IGB=m
  CONFIG_NS83820=m
  CONFIG_HAMACHI=m
  CONFIG_YELLOWFIN=m
@@ -1608,17 +1612,22 @@ CONFIG_SK98LIN=m
  CONFIG_VIA_VELOCITY=m
  CONFIG_TIGON3=m
  CONFIG_BNX2=m
+CONFIG_BNX2X=m
  CONFIG_QLA3XXX=m
  
  #
  # Ethernet (10000 Mbit)
  #
  CONFIG_CHELSIO_T1=m
+# CONFIG_CHELSIO_T3 is not set
+CONFIG_IXGBE=m
+# CONFIG_IXGBE_NAPI is not set
  CONFIG_IXGB=m
  CONFIG_IXGB_NAPI=y
  CONFIG_S2IO=m
  CONFIG_S2IO_NAPI=y
  CONFIG_NETXEN_NIC=m
+CONFIG_MYRI10GE=m
  
  #
  # Token Ring devices
@@ -1801,7 +1810,6 @@ CONFIG_NET_FC=y
  CONFIG_SHAPER=m
  CONFIG_NETCONSOLE=m
  CONFIG_NETPOLL=y
-CONFIG_NETPOLL_RX=y
  CONFIG_NETPOLL_TRAP=y
  CONFIG_NET_POLL_CONTROLLER=y
  
@@ -2366,6 +2374,8 @@ CONFIG_SENSORS_HDAPS=m
  # Misc devices
  #
  CONFIG_IBM_ASM=m
+CONFIG_TIFM_CORE=m
+CONFIG_TIFM_7XX1=m
  
  #
  # Multimedia Capabilities Port drivers
@@ -3113,8 +3123,22 @@ CONFIG_USB_XUSBATM=m
  #
  CONFIG_MMC=m
  # CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+
+#
+# MMC/SD Card Drivers
+#
  CONFIG_MMC_BLOCK=m
+CONFIG_MMC_BLOCK_BOUNCE=y
+CONFIG_SDIO_UART=m
+
+#
+# MMC/SD Host Controller Drivers
+#
+CONFIG_MMC_SDHCI=m
+CONFIG_MMC_RICOH_MMC=m
  CONFIG_MMC_WBSD=m
+CONFIG_MMC_TIFM_SD=m
  
  #
  # InfiniBand support
@@ -3394,7 +3418,7 @@ CONFIG_LOG_BUF_SHIFT=17
  # CONFIG_DEBUG_KOBJECT is not set
  # CONFIG_DEBUG_HIGHMEM is not set
  CONFIG_DEBUG_BUGVERBOSE=y
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_DEBUG_FS=y
  # CONFIG_DEBUG_VM is not set
  # CONFIG_FRAME_POINTER is not set
@@ -3431,7 +3455,13 @@ CONFIG_SECURITY_APPARMOR=m
  # Cryptographic options
  #
  CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=m
+CONFIG_CRYPTO_ABLKCIPHER=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_HASH=m
+CONFIG_CRYPTO_MANAGER=m
  CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_XCBC=m
  CONFIG_CRYPTO_NULL=m
  CONFIG_CRYPTO_MD4=m
  CONFIG_CRYPTO_MD5=y
@@ -3440,9 +3470,18 @@ CONFIG_CRYPTO_SHA256=m
  CONFIG_CRYPTO_SHA512=m
  CONFIG_CRYPTO_WP512=m
  CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_CRYPTD=m
  CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_FCRYPT=m
  CONFIG_CRYPTO_BLOWFISH=m
  CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+CONFIG_CRYPTO_TWOFISH_586=m
  CONFIG_CRYPTO_SERPENT=m
  CONFIG_CRYPTO_AES=m
  CONFIG_CRYPTO_AES_586=m
@@ -3455,13 +3494,15 @@ CONFIG_CRYPTO_ANUBIS=m
  CONFIG_CRYPTO_DEFLATE=m
  CONFIG_CRYPTO_MICHAEL_MIC=m
  CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CAMELLIA=m
  CONFIG_CRYPTO_TEST=m
  
  #
  # Hardware crypto devices
  #
  CONFIG_CRYPTO_DEV_PADLOCK=m
-CONFIG_CRYPTO_DEV_PADLOCK_AES=y
+CONFIG_CRYPTO_DEV_PADLOCK_AES=m
+CONFIG_CRYPTO_DEV_PADLOCK_SHA=m
  
  #
  # Library routines
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config

index 8ef7035..ea7c80f 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config
@@ -1,7 +1,7 @@
  #
  # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16.46
-# Tue Jul  3 17:46:57 2007
+# Linux kernel version: 2.6.16.60
+# Wed May 21 20:31:52 2008
  #
  CONFIG_X86_32=y
  CONFIG_SEMAPHORE_SLEEPERS=y
@@ -26,15 +26,15 @@ CONFIG_LOCALVERSION=""
  CONFIG_LOCALVERSION_AUTO=y
  CONFIG_SUSE_KERNEL=y
  CONFIG_SLE_VERSION=10
-CONFIG_SLE_SP=1
+CONFIG_SLE_SP=2
  CONFIG_SLE_SP_SUBLEVEL=0
  CONFIG_SWAP=y
  CONFIG_SYSVIPC=y
  CONFIG_POSIX_MQUEUE=y
  CONFIG_BSD_PROCESS_ACCT=y
  CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASK_XACCT=y
  CONFIG_SYSCTL=y
  CONFIG_AUDIT=y
@@ -112,6 +112,7 @@ CONFIG_DEFAULT_IOSCHED="cfq"
  # CONFIG_X86_VISWS is not set
  CONFIG_X86_GENERICARCH=y
  # CONFIG_X86_ES7000 is not set
+# CONFIG_X86_VMI is not set
  CONFIG_X86_CYCLONE_TIMER=y
  # CONFIG_M386 is not set
  # CONFIG_M486 is not set
@@ -180,6 +181,7 @@ CONFIG_X86_CPUID=m
  CONFIG_EDD=m
  CONFIG_DELL_RBU=m
  CONFIG_DCDBAS=m
+CONFIG_DMIID=y
  # CONFIG_NOHIGHMEM is not set
  # CONFIG_HIGHMEM4G is not set
  CONFIG_HIGHMEM64G=y
@@ -236,6 +238,7 @@ CONFIG_ACPI_BUTTON=m
  CONFIG_ACPI_VIDEO=m
  # CONFIG_ACPI_HOTKEY is not set
  CONFIG_ACPI_FAN=m
+CONFIG_ACPI_DOCK=m
  CONFIG_ACPI_PROCESSOR=m
  CONFIG_ACPI_HOTPLUG_CPU=y
  CONFIG_ACPI_THERMAL=m
@@ -323,6 +326,7 @@ CONFIG_PCI_MMCONFIG=y
  CONFIG_PCIEPORTBUS=y
  CONFIG_HOTPLUG_PCI_PCIE=m
  # CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set
+CONFIG_PCIEAER=y
  CONFIG_PCI_MSI=y
  # CONFIG_PCI_LEGACY_PROC is not set
  # CONFIG_PCI_DEBUG is not set
@@ -1201,6 +1205,7 @@ CONFIG_SCSI_FC_ATTRS=m
  # CONFIG_SCSI_ISCSI_ATTRS is not set
  CONFIG_SCSI_SAS_ATTRS=m
  CONFIG_SCSI_SAS_LIBSAS=m
+# CONFIG_SCSI_SAS_ATA is not set
  # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
  CONFIG_ISCSI_TARGET=m
  
@@ -1397,11 +1402,8 @@ CONFIG_DM_MULTIPATH=m
  CONFIG_DM_MULTIPATH_EMC=m
  CONFIG_DM_MULTIPATH_HP_SW=m
  CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_ALUA=m
  CONFIG_DM_NL_EVT=y
-
-#
-# Fusion MPT device support
-#
  CONFIG_FUSION=y
  CONFIG_FUSION_SPI=m
  CONFIG_FUSION_FC=m
@@ -1410,6 +1412,7 @@ CONFIG_FUSION_MAX_SGE=128
  CONFIG_FUSION_MAX_FC_SGE=256
  CONFIG_FUSION_CTL=m
  CONFIG_FUSION_LAN=m
+# CONFIG_FUSION_LOGGING is not set
  
  #
  # IEEE 1394 (FireWire) support
@@ -1595,6 +1598,7 @@ CONFIG_DL2K=m
  CONFIG_E1000=m
  CONFIG_E1000_NAPI=y
  # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_IGB=m
  CONFIG_NS83820=m
  CONFIG_HAMACHI=m
  CONFIG_YELLOWFIN=m
@@ -1608,17 +1612,22 @@ CONFIG_SK98LIN=m
  CONFIG_VIA_VELOCITY=m
  CONFIG_TIGON3=m
  CONFIG_BNX2=m
+CONFIG_BNX2X=m
  CONFIG_QLA3XXX=m
  
  #
  # Ethernet (10000 Mbit)
  #
  CONFIG_CHELSIO_T1=m
+# CONFIG_CHELSIO_T3 is not set
+CONFIG_IXGBE=m
+# CONFIG_IXGBE_NAPI is not set
  CONFIG_IXGB=m
  CONFIG_IXGB_NAPI=y
  CONFIG_S2IO=m
  CONFIG_S2IO_NAPI=y
  CONFIG_NETXEN_NIC=m
+CONFIG_MYRI10GE=m
  
  #
  # Token Ring devices
@@ -1801,7 +1810,6 @@ CONFIG_NET_FC=y
  CONFIG_SHAPER=m
  CONFIG_NETCONSOLE=m
  CONFIG_NETPOLL=y
-CONFIG_NETPOLL_RX=y
  CONFIG_NETPOLL_TRAP=y
  CONFIG_NET_POLL_CONTROLLER=y
  
@@ -2366,6 +2374,8 @@ CONFIG_SENSORS_HDAPS=m
  # Misc devices
  #
  CONFIG_IBM_ASM=m
+CONFIG_TIFM_CORE=m
+CONFIG_TIFM_7XX1=m
  
  #
  # Multimedia Capabilities Port drivers
@@ -3113,8 +3123,22 @@ CONFIG_USB_XUSBATM=m
  #
  CONFIG_MMC=m
  # CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+
+#
+# MMC/SD Card Drivers
+#
  CONFIG_MMC_BLOCK=m
+CONFIG_MMC_BLOCK_BOUNCE=y
+CONFIG_SDIO_UART=m
+
+#
+# MMC/SD Host Controller Drivers
+#
+CONFIG_MMC_SDHCI=m
+CONFIG_MMC_RICOH_MMC=m
  CONFIG_MMC_WBSD=m
+CONFIG_MMC_TIFM_SD=m
  
  #
  # InfiniBand support
@@ -3394,7 +3418,7 @@ CONFIG_LOG_BUF_SHIFT=17
  # CONFIG_DEBUG_KOBJECT is not set
  # CONFIG_DEBUG_HIGHMEM is not set
  CONFIG_DEBUG_BUGVERBOSE=y
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_DEBUG_FS=y
  # CONFIG_DEBUG_VM is not set
  # CONFIG_FRAME_POINTER is not set
@@ -3431,7 +3455,13 @@ CONFIG_SECURITY_APPARMOR=m
  # Cryptographic options
  #
  CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=m
+CONFIG_CRYPTO_ABLKCIPHER=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_HASH=m
+CONFIG_CRYPTO_MANAGER=m
  CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_XCBC=m
  CONFIG_CRYPTO_NULL=m
  CONFIG_CRYPTO_MD4=m
  CONFIG_CRYPTO_MD5=y
@@ -3440,9 +3470,18 @@ CONFIG_CRYPTO_SHA256=m
  CONFIG_CRYPTO_SHA512=m
  CONFIG_CRYPTO_WP512=m
  CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_CRYPTD=m
  CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_FCRYPT=m
  CONFIG_CRYPTO_BLOWFISH=m
  CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+CONFIG_CRYPTO_TWOFISH_586=m
  CONFIG_CRYPTO_SERPENT=m
  CONFIG_CRYPTO_AES=m
  CONFIG_CRYPTO_AES_586=m
@@ -3455,13 +3494,15 @@ CONFIG_CRYPTO_ANUBIS=m
  CONFIG_CRYPTO_DEFLATE=m
  CONFIG_CRYPTO_MICHAEL_MIC=m
  CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CAMELLIA=m
  CONFIG_CRYPTO_TEST=m
  
  #
  # Hardware crypto devices
  #
  CONFIG_CRYPTO_DEV_PADLOCK=m
-CONFIG_CRYPTO_DEV_PADLOCK_AES=y
+CONFIG_CRYPTO_DEV_PADLOCK_AES=m
+CONFIG_CRYPTO_DEV_PADLOCK_SHA=m
  
  #
  # Library routines
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-ppc64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-ppc64-smp.config

new file mode 100644 (file)

index 0000000..dd43736
--- /dev/null
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-ppc64-smp.config
@@ -0,0 +1,2190 @@
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_PPC64=y
+CONFIG_64BIT=y
+CONFIG_PPC_MERGE=y
+CONFIG_MMU=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_PPC=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_SYSVIPC_COMPAT=y
+CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
+CONFIG_ARCH_MAY_HAVE_PC_FDC=y
+CONFIG_PPC_OF=y
+CONFIG_PPC_UDBG_16550=y
+CONFIG_GENERIC_TBSYNC=y
+CONFIG_AUDIT_ARCH=y
+# CONFIG_DEFAULT_UIMAGE is not set
+
+#
+# Processor support
+#
+# CONFIG_POWER4_ONLY is not set
+CONFIG_POWER3=y
+CONFIG_POWER4=y
+CONFIG_PPC_FPU=y
+CONFIG_ALTIVEC=y
+CONFIG_PPC_STD_MMU=y
+CONFIG_VIRT_CPU_ACCOUNTING=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=128
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION="-ppc64"
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SUSE_KERNEL=y
+CONFIG_SLE_VERSION=10
+CONFIG_SLE_SP=2
+CONFIG_SLE_SP_SUBLEVEL=0
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_BSD_PROCESS_ACCT_V3=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_SYSCTL=y
+CONFIG_AUDIT=y
+CONFIG_AUDITSYSCALL=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_CPUSETS=y
+CONFIG_RELAY=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_EMBEDDED is not set
+CONFIG_KALLSYMS=y
+CONFIG_KALLSYMS_ALL=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SHMEM=y
+CONFIG_CC_ALIGN_FUNCTIONS=0
+CONFIG_CC_ALIGN_LABELS=0
+CONFIG_CC_ALIGN_LOOPS=0
+CONFIG_CC_ALIGN_JUMPS=0
+CONFIG_SLAB=y
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0
+# CONFIG_SLOB is not set
+CONFIG_OBSOLETE_INTERMODULE=m
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_MODVERSIONS=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+
+#
+# Block layer
+#
+CONFIG_BLK_DEV_IO_TRACE=y
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+CONFIG_DEFAULT_DEADLINE=y
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="deadline"
+
+#
+# Platform support
+#
+CONFIG_PPC_MULTIPLATFORM=y
+# CONFIG_PPC_ISERIES is not set
+# CONFIG_EMBEDDED6xx is not set
+# CONFIG_APUS is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC_PMAC=y
+CONFIG_PPC_PMAC64=y
+CONFIG_PPC_MAPLE=y
+CONFIG_PPC_CELL=y
+CONFIG_XICS=y
+CONFIG_U3_DART=y
+CONFIG_MPIC=y
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_ERROR_LOGGING=y
+CONFIG_RTAS_PROC=y
+CONFIG_RTAS_FLASH=y
+CONFIG_MMIO_NVRAM=y
+CONFIG_MPIC_BROKEN_U3=y
+CONFIG_CELL_IIC=y
+CONFIG_IBMVIO=y
+CONFIG_IBMEBUS=y
+# CONFIG_PPC_MPC106 is not set
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+CONFIG_CPU_FREQ_DEBUG=y
+CONFIG_CPU_FREQ_STAT=m
+CONFIG_CPU_FREQ_STAT_DETAILS=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=m
+CONFIG_CPU_FREQ_GOV_USERSPACE=m
+CONFIG_CPU_FREQ_GOV_ONDEMAND=m
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
+CONFIG_CPU_FREQ_PMAC64=y
+# CONFIG_WANT_EARLY_SERIAL is not set
+
+#
+# Cell Broadband Engine options
+#
+CONFIG_SPU_FS=m
+
+#
+# Kernel options
+#
+CONFIG_HZ_100=y
+# CONFIG_HZ_250 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=100
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+# CONFIG_PREEMPT_BKL is not set
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+CONFIG_FORCE_MAX_ZONEORDER=13
+CONFIG_IOMMU_VMERGE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_PPC_SPLPAR=y
+CONFIG_EEH=y
+CONFIG_SCANLOG=m
+CONFIG_LPARCFG=y
+CONFIG_NUMA=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_NEED_MULTIPLE_NODES=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPARSEMEM_EXTREME=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
+CONFIG_ARCH_MEMORY_PROBE=y
+# CONFIG_PPC_64K_PAGES is not set
+CONFIG_SCHED_SMT=y
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+# CONFIG_PM is not set
+CONFIG_SECCOMP=y
+CONFIG_ISA_DMA_API=y
+
+#
+# Bus options
+#
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_PPC_I8259=y
+# CONFIG_PPC_INDIRECT_PCI is not set
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+CONFIG_PCI_MSI=y
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_DEBUG is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+# CONFIG_PCCARD is not set
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+# CONFIG_HOTPLUG_PCI_SHPC is not set
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_KERNEL_START=0xc000000000000000
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+# CONFIG_NETDEBUG is not set
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_UNIX=y
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_ASK_IP_FIB_HASH=y
+# CONFIG_IP_FIB_TRIE is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_MULTIPLE_TABLES=y
+# CONFIG_IP_ROUTE_FWMARK is not set
+CONFIG_IP_ROUTE_MULTIPATH=y
+# CONFIG_IP_ROUTE_MULTIPATH_CACHED is not set
+CONFIG_IP_ROUTE_VERBOSE=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+CONFIG_INET_TUNNEL=m
+CONFIG_INET_DIAG=m
+CONFIG_INET_TCP_DIAG=m
+CONFIG_TCP_CONG_ADVANCED=y
+
+#
+# TCP congestion control
+#
+CONFIG_TCP_CONG_BIC=m
+CONFIG_TCP_CONG_CUBIC=m
+CONFIG_TCP_CONG_WESTWOOD=m
+CONFIG_TCP_CONG_HTCP=m
+CONFIG_TCP_CONG_HSTCP=m
+CONFIG_TCP_CONG_HYBLA=m
+CONFIG_TCP_CONG_VEGAS=m
+CONFIG_TCP_CONG_SCALABLE=m
+
+#
+# IP: Virtual Server Configuration
+#
+# CONFIG_IP_VS is not set
+CONFIG_IPV6=m
+CONFIG_IPV6_PRIVACY=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_INET6_TUNNEL=m
+CONFIG_IPV6_TUNNEL=m
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# Core Netfilter Configuration
+#
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_NETLINK_LOG=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
+CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_LIMIT=m
+CONFIG_NETFILTER_XT_MATCH_MAC=m
+CONFIG_NETFILTER_XT_MATCH_MARK=m
+CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
+CONFIG_NETFILTER_XT_MATCH_REALM=m
+CONFIG_NETFILTER_XT_MATCH_SCTP=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+# CONFIG_IP_NF_CT_ACCT is not set
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_CONNTRACK_EVENTS=y
+CONFIG_IP_NF_CONNTRACK_NETLINK=m
+CONFIG_IP_NF_CT_PROTO_SCTP=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_NETBIOS_NS=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_PPTP=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_ADDRTYPE=m
+CONFIG_IP_NF_MATCH_HASHLIMIT=m
+CONFIG_IP_NF_MATCH_POLICY=m
+CONFIG_IP_NF_MATCH_IPV4OPTIONS=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_NAT_PPTP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+
+#
+# IPv6: Netfilter Configuration (EXPERIMENTAL)
+#
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_MATCH_POLICY=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_HL=m
+CONFIG_IP6_NF_RAW=m
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_SNAT=m
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_BRIDGE_EBT_ULOG=m
+
+#
+# DCCP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP=m
+CONFIG_INET_DCCP_DIAG=m
+
+#
+# DCCP CCIDs Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP_CCID3=m
+CONFIG_IP_DCCP_TFRC_LIB=m
+
+#
+# DCCP Kernel Hacking
+#
+# CONFIG_IP_DCCP_DEBUG is not set
+# CONFIG_IP_DCCP_UNLOAD_HACK is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+CONFIG_SCTP_HMAC_NONE=y
+# CONFIG_SCTP_HMAC_SHA1 is not set
+# CONFIG_SCTP_HMAC_MD5 is not set
+
+#
+# TIPC Configuration (EXPERIMENTAL)
+#
+# CONFIG_TIPC is not set
+# CONFIG_ATM is not set
+CONFIG_BRIDGE=m
+CONFIG_VLAN_8021Q=m
+# CONFIG_DECNET is not set
+CONFIG_LLC=y
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_IPX_INTERN=y
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CLK_JIFFIES=y
+# CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set
+# CONFIG_NET_SCH_CLK_CPU is not set
+
+#
+# Queueing/Scheduling
+#
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_INGRESS=m
+
+#
+# Classification
+#
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_BASIC=m
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_CLS_U32_PERF=y
+CONFIG_CLS_U32_MARK=y
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_STACK=32
+CONFIG_NET_EMATCH_CMP=m
+CONFIG_NET_EMATCH_NBYTE=m
+CONFIG_NET_EMATCH_U32=m
+CONFIG_NET_EMATCH_META=m
+CONFIG_NET_EMATCH_TEXT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_GACT_PROB=y
+CONFIG_NET_ACT_MIRRED=m
+CONFIG_NET_ACT_IPT=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_ACT_SIMP=m
+CONFIG_NET_CLS_IND=y
+CONFIG_NET_ESTIMATOR=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+CONFIG_BT=m
+CONFIG_BT_L2CAP=m
+CONFIG_BT_SCO=m
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+CONFIG_BT_BNEP=m
+CONFIG_BT_BNEP_MC_FILTER=y
+CONFIG_BT_BNEP_PROTO_FILTER=y
+CONFIG_BT_HIDP=m
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BT_HCIUSB=m
+CONFIG_BT_HCIUSB_SCO=y
+# CONFIG_BT_HCIUART is not set
+CONFIG_BT_HCIBCM203X=m
+CONFIG_BT_HCIBPA10X=m
+CONFIG_BT_HCIBFUSB=m
+CONFIG_BT_HCIVHCI=m
+CONFIG_IEEE80211=m
+# CONFIG_IEEE80211_DEBUG is not set
+CONFIG_IEEE80211_CRYPT_WEP=m
+CONFIG_IEEE80211_CRYPT_CCMP=m
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=m
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_SYS_HYPERVISOR is not set
+
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+CONFIG_CONNECTOR=y
+CONFIG_PROC_EVENTS=y
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_SERIAL=m
+CONFIG_PARPORT_PC_FIFO=y
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_NOT_PC=y
+# CONFIG_PARPORT_GSC is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=m
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_SX8 is not set
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=123456
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CDROM_PKTCDVD=m
+CONFIG_CDROM_PKTCDVD_BUFFERS=8
+CONFIG_CDROM_PKTCDVD_WCACHE=y
+CONFIG_CIPHER_TWOFISH=m
+CONFIG_ATA_OVER_ETH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_IDE_SATA is not set
+CONFIG_BLK_DEV_IDEDISK=y
+# CONFIG_IDEDISK_MULTI_MODE is not set
+CONFIG_BLK_DEV_IDECD=m
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+CONFIG_BLK_DEV_IDESCSI=m
+CONFIG_IDE_TASK_IOCTL=y
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_GENERIC is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+# CONFIG_BLK_DEV_SL82C105 is not set
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_BLK_DEV_IDEDMA_FORCED=y
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+# CONFIG_BLK_DEV_PIIX is not set
+# CONFIG_BLK_DEV_IT821X is not set
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_PDC202XX_OLD is not set
+# CONFIG_BLK_DEV_PDC202XX_NEW is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+CONFIG_BLK_DEV_IDE_PMAC=y
+CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST=y
+CONFIG_BLK_DEV_IDEDMA_PMAC=y
+# CONFIG_BLK_DEV_IDE_PMAC_BLINK is not set
+# CONFIG_IDE_ARM is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_RAID_ATTRS=m
+CONFIG_SCSI=m
+CONFIG_SCSI_NETLINK=y
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_IOSTATS=y
+CONFIG_CHR_DEV_ST=m
+# CONFIG_CHR_DEV_OSST is not set
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport Attributes
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+CONFIG_SCSI_SAS_ATTRS=m
+CONFIG_SCSI_SAS_LIBSAS=m
+CONFIG_SCSI_SAS_ATA=y
+# CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
+CONFIG_ISCSI_TARGET=m
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_ISCSI_TCP is not set
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+# CONFIG_SCSI_3W_9XXX is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+# CONFIG_SCSI_AIC79XX is not set
+CONFIG_SCSI_AIC94XX=m
+# CONFIG_AIC94XX_DEBUG is not set
+# CONFIG_SCSI_ARCMSR is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+CONFIG_MEGARAID_SAS=m
+# CONFIG_SCSI_HPTIOP is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_IBMVSCSIS=m
+# CONFIG_SCSI_INITIO is not set
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_PPA is not set
+# CONFIG_SCSI_IMM is not set
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+CONFIG_SCSI_SYM53C8XX_MMIO=y
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA_FC=m
+# CONFIG_SCSI_QLA_ISCSI is not set
+CONFIG_SCSI_LPFC=m
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+CONFIG_SCSI_DEBUG=m
+
+#
+# Serial ATA (prod) and Parallel ATA (experimental) drivers
+#
+CONFIG_ATA=m
+CONFIG_SATA_AHCI=m
+CONFIG_SATA_SVW=m
+# CONFIG_ATA_PIIX is not set
+# CONFIG_SATA_MV is not set
+# CONFIG_SATA_NV is not set
+# CONFIG_PDC_ADMA is not set
+# CONFIG_SATA_QSTOR is not set
+# CONFIG_SATA_PROMISE is not set
+# CONFIG_SATA_SX4 is not set
+# CONFIG_SATA_SIL is not set
+# CONFIG_SATA_SIL24 is not set
+# CONFIG_SATA_SIS is not set
+# CONFIG_SATA_ULI is not set
+# CONFIG_SATA_VIA is not set
+CONFIG_SATA_VITESSE=m
+CONFIG_SATA_INTEL_COMBINED=y
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MARVELL is not set
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+CONFIG_PATA_PDC2027X=m
+# CONFIG_PATA_SIL680 is not set
+# CONFIG_PATA_SIS is not set
+# CONFIG_PATA_VIA is not set
+CONFIG_PATA_WINBOND=m
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_RAID6=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
+CONFIG_DM_MULTIPATH_HP_SW=m
+CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_ALUA=m
+CONFIG_DM_NL_EVT=y
+CONFIG_FUSION=y
+CONFIG_FUSION_SPI=m
+CONFIG_FUSION_FC=m
+CONFIG_FUSION_SAS=m
+CONFIG_FUSION_MAX_SGE=128
+CONFIG_FUSION_MAX_FC_SGE=256
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_FUSION_LOGGING=y
+
+#
+# IEEE 1394 (FireWire) support
+#
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+CONFIG_IEEE1394_EXPORT_FULL_API=y
+
+#
+# Device Drivers
+#
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+CONFIG_ADB=y
+CONFIG_ADB_PMU=y
+CONFIG_PMAC_SMU=y
+CONFIG_INPUT_ADBHID=y
+CONFIG_MAC_EMUMOUSEBTN=y
+CONFIG_THERM_PM72=y
+CONFIG_WINDFARM=y
+CONFIG_WINDFARM_PM81=y
+CONFIG_WINDFARM_PM91=y
+CONFIG_WINDFARM_PM112=y
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+CONFIG_IFB=m
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+
+#
+# PHY device support
+#
+CONFIG_PHYLIB=m
+
+#
+# MII PHY device drivers
+#
+CONFIG_MARVELL_PHY=m
+CONFIG_DAVICOM_PHY=m
+CONFIG_QSEMI_PHY=m
+CONFIG_LXT_PHY=m
+CONFIG_CICADA_PHY=m
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=m
+# CONFIG_HAPPYMEAL is not set
+CONFIG_SUNGEM=m
+CONFIG_CASSINI=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+CONFIG_NET_TULIP=y
+# CONFIG_DE2104X is not set
+CONFIG_TULIP=m
+CONFIG_TULIP_MWI=y
+CONFIG_TULIP_MMIO=y
+CONFIG_TULIP_NAPI=y
+CONFIG_TULIP_NAPI_HW_MITIGATION=y
+CONFIG_DE4X5=m
+CONFIG_WINBOND_840=m
+CONFIG_DM9102=m
+CONFIG_ULI526X=m
+# CONFIG_HP100 is not set
+CONFIG_IBMVETH=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_AMD8111E_NAPI=y
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+# CONFIG_FORCEDETH is not set
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+# CONFIG_NET_POCKET is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+# CONFIG_DL2K is not set
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_IGB=m
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+CONFIG_R8169=m
+CONFIG_R8169_NAPI=y
+CONFIG_R8169_VLAN=y
+CONFIG_SIS190=m
+# CONFIG_SKGE is not set
+CONFIG_SKY2=m
+# CONFIG_SK98LIN is not set
+# CONFIG_VIA_VELOCITY is not set
+CONFIG_TIGON3=m
+CONFIG_BNX2=m
+CONFIG_BNX2X=m
+CONFIG_SPIDER_NET=m
+# CONFIG_MV643XX_ETH is not set
+CONFIG_QLA3XXX=m
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_CHELSIO_T1=m
+CONFIG_EHEA=m
+# CONFIG_CHELSIO_T3 is not set
+CONFIG_IXGBE=m
+CONFIG_IXGBE_NAPI=y
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_NETXEN_NIC=m
+CONFIG_MYRI10GE=m
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+# CONFIG_3C359 is not set
+# CONFIG_TMS380TR is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PLIP is not set
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+CONFIG_NET_FC=y
+CONFIG_SHAPER=m
+CONFIG_NETCONSOLE=m
+CONFIG_NETPOLL=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_TSDEV=m
+CONFIG_INPUT_TSDEV_SCREEN_X=240
+CONFIG_INPUT_TSDEV_SCREEN_Y=320
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_EVBUG=m
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+CONFIG_MOUSE_SERIAL=m
+# CONFIG_MOUSE_VSXXXAA is not set
+CONFIG_INPUT_JOYSTICK=y
+# CONFIG_JOYSTICK_ANALOG is not set
+# CONFIG_JOYSTICK_A3D is not set
+# CONFIG_JOYSTICK_ADI is not set
+# CONFIG_JOYSTICK_COBRA is not set
+# CONFIG_JOYSTICK_GF2K is not set
+# CONFIG_JOYSTICK_GRIP is not set
+# CONFIG_JOYSTICK_GRIP_MP is not set
+# CONFIG_JOYSTICK_GUILLEMOT is not set
+# CONFIG_JOYSTICK_INTERACT is not set
+# CONFIG_JOYSTICK_SIDEWINDER is not set
+# CONFIG_JOYSTICK_TMDC is not set
+CONFIG_JOYSTICK_IFORCE=m
+CONFIG_JOYSTICK_IFORCE_USB=y
+CONFIG_JOYSTICK_IFORCE_232=y
+CONFIG_JOYSTICK_WARRIOR=m
+CONFIG_JOYSTICK_MAGELLAN=m
+CONFIG_JOYSTICK_SPACEORB=m
+CONFIG_JOYSTICK_SPACEBALL=m
+CONFIG_JOYSTICK_STINGER=m
+CONFIG_JOYSTICK_TWIDJOY=m
+# CONFIG_JOYSTICK_DB9 is not set
+# CONFIG_JOYSTICK_GAMECON is not set
+# CONFIG_JOYSTICK_TURBOGRAFX is not set
+CONFIG_JOYSTICK_JOYDUMP=m
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_TOUCHSCREEN_ADS7846=m
+# CONFIG_TOUCHSCREEN_GUNZE is not set
+# CONFIG_TOUCHSCREEN_ELO is not set
+# CONFIG_TOUCHSCREEN_MTOUCH is not set
+# CONFIG_TOUCHSCREEN_MK712 is not set
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=m
+CONFIG_INPUT_UINPUT=m
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+CONFIG_SERIO_SERPORT=m
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+CONFIG_SERIO_RAW=m
+CONFIG_GAMEPORT=m
+# CONFIG_GAMEPORT_NS558 is not set
+# CONFIG_GAMEPORT_L4 is not set
+# CONFIG_GAMEPORT_EMU10K1 is not set
+# CONFIG_GAMEPORT_FM801 is not set
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+# CONFIG_NOZOMI is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=4
+CONFIG_SERIAL_8250_RUNTIME_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_SERIAL_PMACZILOG=y
+CONFIG_SERIAL_PMACZILOG_CONSOLE=y
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=64
+CONFIG_PRINTER=m
+# CONFIG_LP_CONSOLE is not set
+# CONFIG_PPDEV is not set
+# CONFIG_TIPAR is not set
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVC_RTAS=y
+CONFIG_HVCS=m
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_WATCHDOG_RTAS=m
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
+# CONFIG_RTC is not set
+CONFIG_GEN_RTC=y
+# CONFIG_GEN_RTC_X is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_AGP=m
+CONFIG_AGP_UNINORTH=m
+CONFIG_DRM=m
+# CONFIG_DRM_TDFX is not set
+CONFIG_DRM_R128=m
+CONFIG_DRM_RADEON=m
+# CONFIG_DRM_MGA is not set
+# CONFIG_DRM_SIS is not set
+# CONFIG_DRM_VIA is not set
+# CONFIG_DRM_SAVAGE is not set
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=4096
+CONFIG_HANGCHECK_TIMER=m
+
+#
+# TPM devices
+#
+CONFIG_TCG_TPM=m
+CONFIG_TCG_TIS=m
+CONFIG_TCG_ATMEL=m
+# CONFIG_TELCLOCK is not set
+CONFIG_CRASHER=m
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=m
+
+#
+# I2C Algorithms
+#
+CONFIG_I2C_ALGOBIT=y
+# CONFIG_I2C_ALGOPCF is not set
+# CONFIG_I2C_ALGOPCA is not set
+
+#
+# I2C Hardware Bus support
+#
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI1563 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+CONFIG_I2C_AMD8111=m
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_PIIX4 is not set
+CONFIG_I2C_POWERMAC=y
+# CONFIG_I2C_NFORCE2 is not set
+# CONFIG_I2C_PARPORT is not set
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_PROSAVAGE is not set
+# CONFIG_I2C_SAVAGE4 is not set
+# CONFIG_SCx200_ACB is not set
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+# CONFIG_I2C_STUB is not set
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+# CONFIG_I2C_VOODOO3 is not set
+# CONFIG_I2C_PCA_ISA is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_SENSORS_DS1337 is not set
+# CONFIG_SENSORS_DS1374 is not set
+# CONFIG_SENSORS_EEPROM is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_PCF8591 is not set
+# CONFIG_SENSORS_RTC8564 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_RTC_X1205_I2C is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+
+#
+# SPI support
+#
+CONFIG_SPI=y
+CONFIG_SPI_DEBUG=y
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_BITBANG=m
+CONFIG_SPI_BUTTERFLY=m
+
+#
+# SPI Protocol Masters
+#
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Hardware Monitoring support
+#
+# CONFIG_HWMON is not set
+# CONFIG_HWMON_VID is not set
+
+#
+# Misc devices
+#
+# CONFIG_TIFM_CORE is not set
+
+#
+# Multimedia Capabilities Port drivers
+#
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+
+#
+# Video For Linux
+#
+
+#
+# Video Adapters
+#
+# CONFIG_VIDEO_ADV_DEBUG is not set
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_SAA6588=m
+# CONFIG_VIDEO_BWQCAM is not set
+# CONFIG_VIDEO_CQCAM is not set
+# CONFIG_VIDEO_W9966 is not set
+# CONFIG_VIDEO_CPIA is not set
+# CONFIG_VIDEO_SAA5246A is not set
+# CONFIG_VIDEO_SAA5249 is not set
+# CONFIG_TUNER_3036 is not set
+# CONFIG_VIDEO_STRADIS is not set
+# CONFIG_VIDEO_ZORAN is not set
+# CONFIG_VIDEO_SAA7134 is not set
+# CONFIG_VIDEO_MXB is not set
+# CONFIG_VIDEO_DPC is not set
+# CONFIG_VIDEO_HEXIUM_ORION is not set
+# CONFIG_VIDEO_HEXIUM_GEMINI is not set
+# CONFIG_VIDEO_CX88 is not set
+# CONFIG_VIDEO_EM28XX is not set
+# CONFIG_VIDEO_OVCAMCHIP is not set
+# CONFIG_VIDEO_AUDIO_DECODER is not set
+# CONFIG_VIDEO_DECODER is not set
+
+#
+# Radio Adapters
+#
+# CONFIG_RADIO_GEMTEK_PCI is not set
+# CONFIG_RADIO_MAXIRADIO is not set
+# CONFIG_RADIO_MAESTRO is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+CONFIG_VIDEO_TUNER=m
+CONFIG_VIDEO_BUF=m
+CONFIG_VIDEO_BTCX=m
+CONFIG_VIDEO_IR=m
+CONFIG_VIDEO_TVEEPROM=m
+
+#
+# Graphics support
+#
+CONFIG_FB=y
+CONFIG_FB_CFB_FILLRECT=y
+CONFIG_FB_CFB_COPYAREA=y
+CONFIG_FB_CFB_IMAGEBLIT=y
+CONFIG_FB_MACMODES=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+# CONFIG_FB_CIRRUS is not set
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_CONTROL is not set
+# CONFIG_FB_PLATINUM is not set
+# CONFIG_FB_VALKYRIE is not set
+# CONFIG_FB_CT65550 is not set
+# CONFIG_FB_ASILIANT is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_S1D13XXX is not set
+CONFIG_FB_NVIDIA=y
+CONFIG_FB_NVIDIA_I2C=y
+# CONFIG_FB_RIVA is not set
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+CONFIG_FB_MATROX_MULTIHEAD=y
+# CONFIG_FB_RADEON_OLD is not set
+CONFIG_FB_RADEON=y
+CONFIG_FB_RADEON_I2C=y
+# CONFIG_FB_RADEON_DEBUG is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+# CONFIG_FB_SAVAGE is not set
+# CONFIG_FB_SIS is not set
+# CONFIG_FB_NEOMAGIC is not set
+# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set
+# CONFIG_FB_VOODOO1 is not set
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+# CONFIG_VGA_CONSOLE is not set
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+CONFIG_LOGO_LINUX_VGA16=y
+# CONFIG_LOGO_LINUX_CLUT224 is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BACKLIGHT_CLASS_DEVICE=m
+CONFIG_BACKLIGHT_DEVICE=y
+CONFIG_LCD_CLASS_DEVICE=m
+CONFIG_LCD_DEVICE=y
+
+#
+# Bootsplash configuration
+#
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+
+#
+# Advanced Linux Sound Architecture
+#
+CONFIG_SND=m
+CONFIG_SND_TIMER=m
+CONFIG_SND_PCM=m
+CONFIG_SND_HWDEP=m
+CONFIG_SND_RAWMIDI=m
+CONFIG_SND_SEQUENCER=m
+CONFIG_SND_SEQ_DUMMY=m
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_MIXER_OSS=m
+CONFIG_SND_PCM_OSS=m
+CONFIG_SND_PCM_OSS_PLUGINS=y
+CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_DYNAMIC_MINORS=y
+CONFIG_SND_SUPPORT_OLD_API=y
+CONFIG_SND_VERBOSE_PROCFS=y
+CONFIG_SND_VERBOSE_PRINTK=y
+CONFIG_SND_DEBUG=y
+# CONFIG_SND_DEBUG_DETECT is not set
+
+#
+# Generic devices
+#
+CONFIG_SND_MPU401_UART=m
+CONFIG_SND_DUMMY=m
+CONFIG_SND_VIRMIDI=m
+CONFIG_SND_MTPAV=m
+CONFIG_SND_SERIAL_U16550=m
+CONFIG_SND_MPU401=m
+
+#
+# PCI devices
+#
+# CONFIG_SND_AD1889 is not set
+# CONFIG_SND_ALS4000 is not set
+# CONFIG_SND_ALI5451 is not set
+# CONFIG_SND_ATIIXP is not set
+# CONFIG_SND_ATIIXP_MODEM is not set
+# CONFIG_SND_AU8810 is not set
+# CONFIG_SND_AU8820 is not set
+# CONFIG_SND_AU8830 is not set
+# CONFIG_SND_AZT3328 is not set
+# CONFIG_SND_BT87X is not set
+# CONFIG_SND_CA0106 is not set
+# CONFIG_SND_CMIPCI is not set
+# CONFIG_SND_CS4281 is not set
+# CONFIG_SND_CS46XX is not set
+CONFIG_SND_DARLA20=m
+CONFIG_SND_GINA20=m
+CONFIG_SND_LAYLA20=m
+CONFIG_SND_DARLA24=m
+CONFIG_SND_GINA24=m
+CONFIG_SND_LAYLA24=m
+CONFIG_SND_MONA=m
+CONFIG_SND_MIA=m
+CONFIG_SND_ECHO3G=m
+CONFIG_SND_INDIGO=m
+CONFIG_SND_INDIGOIO=m
+CONFIG_SND_INDIGODJ=m
+# CONFIG_SND_EMU10K1 is not set
+# CONFIG_SND_EMU10K1X is not set
+# CONFIG_SND_ENS1370 is not set
+# CONFIG_SND_ENS1371 is not set
+# CONFIG_SND_ES1938 is not set
+# CONFIG_SND_ES1968 is not set
+# CONFIG_SND_FM801 is not set
+# CONFIG_SND_HDA_INTEL is not set
+# CONFIG_SND_HDSP is not set
+# CONFIG_SND_HDSPM is not set
+# CONFIG_SND_ICE1712 is not set
+# CONFIG_SND_ICE1724 is not set
+# CONFIG_SND_INTEL8X0 is not set
+# CONFIG_SND_INTEL8X0M is not set
+# CONFIG_SND_KORG1212 is not set
+# CONFIG_SND_MAESTRO3 is not set
+# CONFIG_SND_MIXART is not set
+# CONFIG_SND_NM256 is not set
+# CONFIG_SND_PCXHR is not set
+# CONFIG_SND_RME32 is not set
+# CONFIG_SND_RME96 is not set
+# CONFIG_SND_RME9652 is not set
+# CONFIG_SND_SONICVIBES is not set
+# CONFIG_SND_TRIDENT is not set
+# CONFIG_SND_VIA82XX is not set
+# CONFIG_SND_VIA82XX_MODEM is not set
+# CONFIG_SND_VX222 is not set
+# CONFIG_SND_YMFPCI is not set
+
+#
+# ALSA PowerMac devices
+#
+CONFIG_SND_POWERMAC=m
+CONFIG_SND_POWERMAC_AUTO_DRC=y
+
+#
+# USB devices
+#
+CONFIG_SND_USB_AUDIO=m
+CONFIG_SND_USB_USX2Y=m
+
+#
+# Open Sound System
+#
+# CONFIG_SOUND_PRIME is not set
+
+#
+# USB support
+#
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+CONFIG_USB_BANDWIDTH=y
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_OTG is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+# CONFIG_USB_ISP116X_HCD is not set
+CONFIG_USB_OHCI_HCD=y
+# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_UHCI_HCD=m
+# CONFIG_USB_SL811_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_OBSOLETE_OSS_USB_DRIVER is not set
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# may also be needed; see USB_STORAGE Help for more information
+#
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_USBAT=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_STORAGE_ALAUDA=y
+CONFIG_USB_STORAGE_ONETOUCH=y
+# CONFIG_USB_LIBUSUAL is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+CONFIG_USB_HIDINPUT=y
+# CONFIG_USB_HIDINPUT_POWERBOOK is not set
+CONFIG_HID_FF=y
+CONFIG_HID_PID=y
+CONFIG_LOGITECH_FF=y
+CONFIG_THRUSTMASTER_FF=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_ACECAD=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_MTOUCH=m
+CONFIG_USB_ITMTOUCH=m
+CONFIG_USB_EGALAX=m
+CONFIG_USB_YEALINK=m
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+CONFIG_USB_ATI_REMOTE2=m
+CONFIG_USB_KEYSPAN_REMOTE=m
+CONFIG_USB_APPLETOUCH=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+
+#
+# USB Multimedia devices
+#
+CONFIG_USB_DABUSB=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_DSBR=m
+CONFIG_USB_ET61X251=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_KONICAWC=m
+CONFIG_USB_OV511=m
+CONFIG_USB_SE401=m
+CONFIG_USB_SN9C102=m
+CONFIG_USB_STV680=m
+CONFIG_USB_PWC=m
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_NET_AX8817X=m
+CONFIG_USB_NET_CDCETHER=m
+CONFIG_USB_NET_GL620A=m
+CONFIG_USB_NET_NET1080=m
+CONFIG_USB_NET_PLUSB=m
+CONFIG_USB_NET_RNDIS_HOST=m
+CONFIG_USB_NET_CDC_SUBSET=m
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_NET_ZAURUS=m
+# CONFIG_USB_MON is not set
+
+#
+# USB port drivers
+#
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_AIRPRIME=m
+CONFIG_USB_SERIAL_ANYDATA=m
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_CP2101=m
+CONFIG_USB_SERIAL_CYPRESS_M8=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_FUNSOFT=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_GARMIN=m
+CONFIG_USB_SERIAL_IPW=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_NAVMAN=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_HP4X=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_SIERRAWIRELESS=m
+CONFIG_USB_SERIAL_TI=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_LEGOTOWER=m
+CONFIG_USB_LCD=m
+CONFIG_USB_BERRY_CHARGE=m
+CONFIG_USB_LED=m
+CONFIG_USB_CYTHERM=m
+CONFIG_USB_PHIDGETKIT=m
+CONFIG_USB_PHIDGETSERVO=m
+CONFIG_USB_IDMOUSE=m
+CONFIG_USB_SISUSBVGA=m
+CONFIG_USB_SISUSBVGA_CON=y
+CONFIG_USB_LD=m
+# CONFIG_USB_TEST is not set
+
+#
+# USB DSL modem support
+#
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+# CONFIG_MMC is not set
+
+#
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+
+#
+# DMA Engine support
+#
+# CONFIG_DMA_ENGINE is not set
+
+#
+# DMA Clients
+#
+
+#
+# DMA Devices
+#
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+# CONFIG_EXT2_FS_XIP is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
+# CONFIG_JFS_DEBUG is not set
+CONFIG_JFS_STATISTICS=y
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_QUOTA=m
+CONFIG_XFS_DMAPI=m
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_XFS_RT=y
+# CONFIG_XFS_DEBUG is not set
+# CONFIG_XFS_TRACE is not set
+CONFIG_OCFS2_FS=m
+CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m
+CONFIG_MINIX_FS=m
+CONFIG_ROMFS_FS=m
+CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
+CONFIG_DMAPI=m
+# CONFIG_DMAPI_DEBUG is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
+CONFIG_QFMT_V2=m
+CONFIG_QUOTACTL=y
+CONFIG_DNOTIFY=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_FUSE_FS=m
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=m
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=m
+CONFIG_UDF_FS=m
+CONFIG_UDF_NLS=y
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+CONFIG_NTFS_FS=m
+# CONFIG_NTFS_DEBUG is not set
+# CONFIG_NTFS_RW is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=m
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+CONFIG_UFS_FS=m
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V2_ACL=y
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_V3_ACL=y
+CONFIG_NFSD_V4=y
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_NFS_ACL_SUPPORT=m
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=m
+CONFIG_SUNRPC_GSS=m
+CONFIG_RPCSEC_GSS_KRB5=m
+CONFIG_RPCSEC_GSS_SPKM3=m
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+# CONFIG_CIFS_STATS2 is not set
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+# CONFIG_CIFS_EXPERIMENTAL is not set
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+CONFIG_9P_FS=m
+CONFIG_GENERIC_ACL=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_ATARI_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+CONFIG_SGI_PARTITION=y
+CONFIG_ULTRIX_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ASCII=m
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Library routines
+#
+CONFIG_CRC_CCITT=m
+CONFIG_CRC16=m
+CONFIG_CRC32=y
+CONFIG_LIBCRC32C=m
+CONFIG_ZLIB_INFLATE=m
+CONFIG_ZLIB_DEFLATE=m
+CONFIG_TEXTSEARCH=y
+CONFIG_TEXTSEARCH_KMP=m
+CONFIG_TEXTSEARCH_BM=m
+CONFIG_TEXTSEARCH_FSM=m
+
+#
+# Instrumentation Support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_KPROBES=y
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_LOG_BUF_SHIFT=19
+# CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_KOBJECT is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
+CONFIG_FORCED_INLINING=y
+# CONFIG_RCU_TORTURE_TEST is not set
+# CONFIG_LKCD_DUMP is not set
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_HCALL_STATS=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+# CONFIG_XMON_DEFAULT is not set
+CONFIG_IRQSTACKS=y
+CONFIG_BOOTX_TEXT=y
+# CONFIG_PPC_EARLY_DEBUG_LPAR is not set
+# CONFIG_PPC_EARLY_DEBUG_G5 is not set
+# CONFIG_PPC_EARLY_DEBUG_RTAS is not set
+# CONFIG_PPC_EARLY_DEBUG_MAPLE is not set
+# CONFIG_PPC_EARLY_DEBUG_ISERIES is not set
+
+#
+# Security options
+#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+# CONFIG_SECURITY_NETWORK_XFRM is not set
+CONFIG_SECURITY_CAPABILITIES=m
+CONFIG_SECURITY_ROOTPLUG=m
+CONFIG_SECURITY_SECLVL=m
+# CONFIG_SECURITY_SELINUX is not set
+CONFIG_SECURITY_APPARMOR=m
+CONFIG_KEYS_COMPAT=y
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ABLKCIPHER=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_CRYPTD=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_FCRYPT=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_KHAZAD=m
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CAMELLIA=m
+CONFIG_CRYPTO_TEST=m
+
+#
+# Hardware crypto devices
+#
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-ppc64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-ppc64.config

new file mode 100644 (file)

index 0000000..fff3b80
--- /dev/null
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-ppc64.config
@@ -0,0 +1,2184 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.16.60
+# Fri Aug  1 03:42:28 2008
+#
+CONFIG_PPC64=y
+CONFIG_64BIT=y
+CONFIG_PPC_MERGE=y
+CONFIG_MMU=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_PPC=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_SYSVIPC_COMPAT=y
+CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
+CONFIG_ARCH_MAY_HAVE_PC_FDC=y
+CONFIG_PPC_OF=y
+CONFIG_PPC_UDBG_16550=y
+CONFIG_GENERIC_TBSYNC=y
+CONFIG_AUDIT_ARCH=y
+# CONFIG_DEFAULT_UIMAGE is not set
+
+#
+# Processor support
+#
+# CONFIG_POWER4_ONLY is not set
+CONFIG_POWER3=y
+CONFIG_POWER4=y
+CONFIG_PPC_FPU=y
+CONFIG_ALTIVEC=y
+CONFIG_PPC_STD_MMU=y
+CONFIG_VIRT_CPU_ACCOUNTING=y
+# CONFIG_SMP is not set
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION="-ppc64"
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SUSE_KERNEL=y
+CONFIG_SLE_VERSION=10
+CONFIG_SLE_SP=2
+CONFIG_SLE_SP_SUBLEVEL=0
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_BSD_PROCESS_ACCT_V3=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_SYSCTL=y
+CONFIG_AUDIT=y
+CONFIG_AUDITSYSCALL=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_RELAY=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_EMBEDDED is not set
+CONFIG_KALLSYMS=y
+CONFIG_KALLSYMS_ALL=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SHMEM=y
+CONFIG_CC_ALIGN_FUNCTIONS=0
+CONFIG_CC_ALIGN_LABELS=0
+CONFIG_CC_ALIGN_LOOPS=0
+CONFIG_CC_ALIGN_JUMPS=0
+CONFIG_SLAB=y
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0
+# CONFIG_SLOB is not set
+CONFIG_OBSOLETE_INTERMODULE=m
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_MODVERSIONS=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_KMOD=y
+
+#
+# Block layer
+#
+CONFIG_BLK_DEV_IO_TRACE=y
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+CONFIG_DEFAULT_DEADLINE=y
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="deadline"
+
+#
+# Platform support
+#
+CONFIG_PPC_MULTIPLATFORM=y
+# CONFIG_PPC_ISERIES is not set
+# CONFIG_EMBEDDED6xx is not set
+# CONFIG_APUS is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC_PMAC=y
+CONFIG_PPC_PMAC64=y
+CONFIG_PPC_MAPLE=y
+CONFIG_PPC_CELL=y
+CONFIG_XICS=y
+CONFIG_U3_DART=y
+CONFIG_MPIC=y
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_ERROR_LOGGING=y
+CONFIG_RTAS_PROC=y
+CONFIG_RTAS_FLASH=y
+CONFIG_MMIO_NVRAM=y
+CONFIG_MPIC_BROKEN_U3=y
+CONFIG_CELL_IIC=y
+CONFIG_IBMVIO=y
+CONFIG_IBMEBUS=y
+# CONFIG_PPC_MPC106 is not set
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+CONFIG_CPU_FREQ_DEBUG=y
+CONFIG_CPU_FREQ_STAT=m
+CONFIG_CPU_FREQ_STAT_DETAILS=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=m
+CONFIG_CPU_FREQ_GOV_USERSPACE=m
+CONFIG_CPU_FREQ_GOV_ONDEMAND=m
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
+CONFIG_CPU_FREQ_PMAC64=y
+# CONFIG_WANT_EARLY_SERIAL is not set
+
+#
+# Cell Broadband Engine options
+#
+CONFIG_SPU_FS=m
+
+#
+# Kernel options
+#
+CONFIG_HZ_100=y
+# CONFIG_HZ_250 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=100
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+CONFIG_FORCE_MAX_ZONEORDER=13
+CONFIG_IOMMU_VMERGE=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_PPC_SPLPAR=y
+CONFIG_EEH=y
+CONFIG_SCANLOG=m
+CONFIG_LPARCFG=y
+CONFIG_NUMA=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_NEED_MULTIPLE_NODES=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPARSEMEM_EXTREME=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
+CONFIG_ARCH_MEMORY_PROBE=y
+# CONFIG_PPC_64K_PAGES is not set
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+# CONFIG_PM is not set
+CONFIG_SECCOMP=y
+CONFIG_ISA_DMA_API=y
+
+#
+# Bus options
+#
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_PPC_I8259=y
+# CONFIG_PPC_INDIRECT_PCI is not set
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+CONFIG_PCI_MSI=y
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_DEBUG is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+# CONFIG_PCCARD is not set
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+# CONFIG_HOTPLUG_PCI_SHPC is not set
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_KERNEL_START=0xc000000000000000
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+# CONFIG_NETDEBUG is not set
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_UNIX=y
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_ASK_IP_FIB_HASH=y
+# CONFIG_IP_FIB_TRIE is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_MULTIPLE_TABLES=y
+# CONFIG_IP_ROUTE_FWMARK is not set
+CONFIG_IP_ROUTE_MULTIPATH=y
+# CONFIG_IP_ROUTE_MULTIPATH_CACHED is not set
+CONFIG_IP_ROUTE_VERBOSE=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+CONFIG_INET_TUNNEL=m
+CONFIG_INET_DIAG=m
+CONFIG_INET_TCP_DIAG=m
+CONFIG_TCP_CONG_ADVANCED=y
+
+#
+# TCP congestion control
+#
+CONFIG_TCP_CONG_BIC=m
+CONFIG_TCP_CONG_CUBIC=m
+CONFIG_TCP_CONG_WESTWOOD=m
+CONFIG_TCP_CONG_HTCP=m
+CONFIG_TCP_CONG_HSTCP=m
+CONFIG_TCP_CONG_HYBLA=m
+CONFIG_TCP_CONG_VEGAS=m
+CONFIG_TCP_CONG_SCALABLE=m
+
+#
+# IP: Virtual Server Configuration
+#
+# CONFIG_IP_VS is not set
+CONFIG_IPV6=m
+CONFIG_IPV6_PRIVACY=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_INET6_TUNNEL=m
+CONFIG_IPV6_TUNNEL=m
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# Core Netfilter Configuration
+#
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_NETLINK_LOG=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
+CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_LIMIT=m
+CONFIG_NETFILTER_XT_MATCH_MAC=m
+CONFIG_NETFILTER_XT_MATCH_MARK=m
+CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
+CONFIG_NETFILTER_XT_MATCH_REALM=m
+CONFIG_NETFILTER_XT_MATCH_SCTP=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+# CONFIG_IP_NF_CT_ACCT is not set
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_CONNTRACK_EVENTS=y
+CONFIG_IP_NF_CONNTRACK_NETLINK=m
+CONFIG_IP_NF_CT_PROTO_SCTP=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_NETBIOS_NS=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_PPTP=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_ADDRTYPE=m
+CONFIG_IP_NF_MATCH_HASHLIMIT=m
+CONFIG_IP_NF_MATCH_POLICY=m
+CONFIG_IP_NF_MATCH_IPV4OPTIONS=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_NAT_PPTP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+
+#
+# IPv6: Netfilter Configuration (EXPERIMENTAL)
+#
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_MATCH_POLICY=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_HL=m
+CONFIG_IP6_NF_RAW=m
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_SNAT=m
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_BRIDGE_EBT_ULOG=m
+
+#
+# DCCP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP=m
+CONFIG_INET_DCCP_DIAG=m
+
+#
+# DCCP CCIDs Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP_CCID3=m
+CONFIG_IP_DCCP_TFRC_LIB=m
+
+#
+# DCCP Kernel Hacking
+#
+# CONFIG_IP_DCCP_DEBUG is not set
+# CONFIG_IP_DCCP_UNLOAD_HACK is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+CONFIG_SCTP_HMAC_NONE=y
+# CONFIG_SCTP_HMAC_SHA1 is not set
+# CONFIG_SCTP_HMAC_MD5 is not set
+
+#
+# TIPC Configuration (EXPERIMENTAL)
+#
+# CONFIG_TIPC is not set
+# CONFIG_ATM is not set
+CONFIG_BRIDGE=m
+CONFIG_VLAN_8021Q=m
+# CONFIG_DECNET is not set
+CONFIG_LLC=y
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_IPX_INTERN=y
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CLK_JIFFIES=y
+# CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set
+# CONFIG_NET_SCH_CLK_CPU is not set
+
+#
+# Queueing/Scheduling
+#
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_INGRESS=m
+
+#
+# Classification
+#
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_BASIC=m
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_CLS_U32_PERF=y
+CONFIG_CLS_U32_MARK=y
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_STACK=32
+CONFIG_NET_EMATCH_CMP=m
+CONFIG_NET_EMATCH_NBYTE=m
+CONFIG_NET_EMATCH_U32=m
+CONFIG_NET_EMATCH_META=m
+CONFIG_NET_EMATCH_TEXT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_GACT_PROB=y
+CONFIG_NET_ACT_MIRRED=m
+CONFIG_NET_ACT_IPT=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_ACT_SIMP=m
+CONFIG_NET_CLS_IND=y
+CONFIG_NET_ESTIMATOR=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+CONFIG_BT=m
+CONFIG_BT_L2CAP=m
+CONFIG_BT_SCO=m
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+CONFIG_BT_BNEP=m
+CONFIG_BT_BNEP_MC_FILTER=y
+CONFIG_BT_BNEP_PROTO_FILTER=y
+CONFIG_BT_HIDP=m
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BT_HCIUSB=m
+CONFIG_BT_HCIUSB_SCO=y
+# CONFIG_BT_HCIUART is not set
+CONFIG_BT_HCIBCM203X=m
+CONFIG_BT_HCIBPA10X=m
+CONFIG_BT_HCIBFUSB=m
+CONFIG_BT_HCIVHCI=m
+CONFIG_IEEE80211=m
+# CONFIG_IEEE80211_DEBUG is not set
+CONFIG_IEEE80211_CRYPT_WEP=m
+CONFIG_IEEE80211_CRYPT_CCMP=m
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=m
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_SYS_HYPERVISOR is not set
+
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+CONFIG_CONNECTOR=y
+CONFIG_PROC_EVENTS=y
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_SERIAL=m
+CONFIG_PARPORT_PC_FIFO=y
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_NOT_PC=y
+# CONFIG_PARPORT_GSC is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=m
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_SX8 is not set
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=123456
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CDROM_PKTCDVD=m
+CONFIG_CDROM_PKTCDVD_BUFFERS=8
+CONFIG_CDROM_PKTCDVD_WCACHE=y
+CONFIG_CIPHER_TWOFISH=m
+CONFIG_ATA_OVER_ETH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_IDE_SATA is not set
+CONFIG_BLK_DEV_IDEDISK=y
+# CONFIG_IDEDISK_MULTI_MODE is not set
+CONFIG_BLK_DEV_IDECD=m
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+CONFIG_BLK_DEV_IDESCSI=m
+CONFIG_IDE_TASK_IOCTL=y
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_GENERIC is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+# CONFIG_BLK_DEV_SL82C105 is not set
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_BLK_DEV_IDEDMA_FORCED=y
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+# CONFIG_BLK_DEV_PIIX is not set
+# CONFIG_BLK_DEV_IT821X is not set
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_PDC202XX_OLD is not set
+# CONFIG_BLK_DEV_PDC202XX_NEW is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+CONFIG_BLK_DEV_IDE_PMAC=y
+CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST=y
+CONFIG_BLK_DEV_IDEDMA_PMAC=y
+# CONFIG_BLK_DEV_IDE_PMAC_BLINK is not set
+# CONFIG_IDE_ARM is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_RAID_ATTRS=m
+CONFIG_SCSI=m
+CONFIG_SCSI_NETLINK=y
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_IOSTATS=y
+CONFIG_CHR_DEV_ST=m
+# CONFIG_CHR_DEV_OSST is not set
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport Attributes
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+CONFIG_SCSI_SAS_ATTRS=m
+CONFIG_SCSI_SAS_LIBSAS=m
+CONFIG_SCSI_SAS_ATA=y
+# CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
+CONFIG_ISCSI_TARGET=m
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_ISCSI_TCP is not set
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+# CONFIG_SCSI_3W_9XXX is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+# CONFIG_SCSI_AIC79XX is not set
+CONFIG_SCSI_AIC94XX=m
+# CONFIG_AIC94XX_DEBUG is not set
+# CONFIG_SCSI_ARCMSR is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+CONFIG_MEGARAID_SAS=m
+# CONFIG_SCSI_HPTIOP is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_IBMVSCSIS=m
+# CONFIG_SCSI_INITIO is not set
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_PPA is not set
+# CONFIG_SCSI_IMM is not set
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+CONFIG_SCSI_SYM53C8XX_MMIO=y
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA_FC=m
+# CONFIG_SCSI_QLA_ISCSI is not set
+CONFIG_SCSI_LPFC=m
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+CONFIG_SCSI_DEBUG=m
+
+#
+# Serial ATA (prod) and Parallel ATA (experimental) drivers
+#
+CONFIG_ATA=m
+CONFIG_SATA_AHCI=m
+CONFIG_SATA_SVW=m
+# CONFIG_ATA_PIIX is not set
+# CONFIG_SATA_MV is not set
+# CONFIG_SATA_NV is not set
+# CONFIG_PDC_ADMA is not set
+# CONFIG_SATA_QSTOR is not set
+# CONFIG_SATA_PROMISE is not set
+# CONFIG_SATA_SX4 is not set
+# CONFIG_SATA_SIL is not set
+# CONFIG_SATA_SIL24 is not set
+# CONFIG_SATA_SIS is not set
+# CONFIG_SATA_ULI is not set
+# CONFIG_SATA_VIA is not set
+CONFIG_SATA_VITESSE=m
+CONFIG_SATA_INTEL_COMBINED=y
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MARVELL is not set
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+CONFIG_PATA_PDC2027X=m
+# CONFIG_PATA_SIL680 is not set
+# CONFIG_PATA_SIS is not set
+# CONFIG_PATA_VIA is not set
+CONFIG_PATA_WINBOND=m
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_RAID6=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
+CONFIG_DM_MULTIPATH_HP_SW=m
+CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_ALUA=m
+CONFIG_DM_NL_EVT=y
+CONFIG_FUSION=y
+CONFIG_FUSION_SPI=m
+CONFIG_FUSION_FC=m
+CONFIG_FUSION_SAS=m
+CONFIG_FUSION_MAX_SGE=128
+CONFIG_FUSION_MAX_FC_SGE=256
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_FUSION_LOGGING=y
+
+#
+# IEEE 1394 (FireWire) support
+#
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+CONFIG_IEEE1394_EXPORT_FULL_API=y
+
+#
+# Device Drivers
+#
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+CONFIG_ADB=y
+CONFIG_ADB_PMU=y
+CONFIG_PMAC_SMU=y
+CONFIG_INPUT_ADBHID=y
+CONFIG_MAC_EMUMOUSEBTN=y
+CONFIG_THERM_PM72=y
+CONFIG_WINDFARM=y
+CONFIG_WINDFARM_PM81=y
+CONFIG_WINDFARM_PM91=y
+CONFIG_WINDFARM_PM112=y
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+CONFIG_IFB=m
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+
+#
+# PHY device support
+#
+CONFIG_PHYLIB=m
+
+#
+# MII PHY device drivers
+#
+CONFIG_MARVELL_PHY=m
+CONFIG_DAVICOM_PHY=m
+CONFIG_QSEMI_PHY=m
+CONFIG_LXT_PHY=m
+CONFIG_CICADA_PHY=m
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=m
+# CONFIG_HAPPYMEAL is not set
+CONFIG_SUNGEM=m
+CONFIG_CASSINI=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+CONFIG_NET_TULIP=y
+# CONFIG_DE2104X is not set
+CONFIG_TULIP=m
+CONFIG_TULIP_MWI=y
+CONFIG_TULIP_MMIO=y
+CONFIG_TULIP_NAPI=y
+CONFIG_TULIP_NAPI_HW_MITIGATION=y
+CONFIG_DE4X5=m
+CONFIG_WINBOND_840=m
+CONFIG_DM9102=m
+CONFIG_ULI526X=m
+# CONFIG_HP100 is not set
+CONFIG_IBMVETH=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_AMD8111E_NAPI=y
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+# CONFIG_FORCEDETH is not set
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+# CONFIG_NET_POCKET is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+# CONFIG_DL2K is not set
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_IGB=m
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+CONFIG_R8169=m
+CONFIG_R8169_NAPI=y
+CONFIG_R8169_VLAN=y
+CONFIG_SIS190=m
+# CONFIG_SKGE is not set
+CONFIG_SKY2=m
+# CONFIG_SK98LIN is not set
+# CONFIG_VIA_VELOCITY is not set
+CONFIG_TIGON3=m
+CONFIG_BNX2=m
+CONFIG_BNX2X=m
+CONFIG_SPIDER_NET=m
+# CONFIG_MV643XX_ETH is not set
+CONFIG_QLA3XXX=m
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_CHELSIO_T1=m
+CONFIG_EHEA=m
+# CONFIG_CHELSIO_T3 is not set
+CONFIG_IXGBE=m
+CONFIG_IXGBE_NAPI=y
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_NETXEN_NIC=m
+CONFIG_MYRI10GE=m
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+# CONFIG_3C359 is not set
+# CONFIG_TMS380TR is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PLIP is not set
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+CONFIG_NET_FC=y
+CONFIG_SHAPER=m
+CONFIG_NETCONSOLE=m
+CONFIG_NETPOLL=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_TSDEV=m
+CONFIG_INPUT_TSDEV_SCREEN_X=240
+CONFIG_INPUT_TSDEV_SCREEN_Y=320
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_EVBUG=m
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+CONFIG_MOUSE_SERIAL=m
+# CONFIG_MOUSE_VSXXXAA is not set
+CONFIG_INPUT_JOYSTICK=y
+# CONFIG_JOYSTICK_ANALOG is not set
+# CONFIG_JOYSTICK_A3D is not set
+# CONFIG_JOYSTICK_ADI is not set
+# CONFIG_JOYSTICK_COBRA is not set
+# CONFIG_JOYSTICK_GF2K is not set
+# CONFIG_JOYSTICK_GRIP is not set
+# CONFIG_JOYSTICK_GRIP_MP is not set
+# CONFIG_JOYSTICK_GUILLEMOT is not set
+# CONFIG_JOYSTICK_INTERACT is not set
+# CONFIG_JOYSTICK_SIDEWINDER is not set
+# CONFIG_JOYSTICK_TMDC is not set
+CONFIG_JOYSTICK_IFORCE=m
+CONFIG_JOYSTICK_IFORCE_USB=y
+CONFIG_JOYSTICK_IFORCE_232=y
+CONFIG_JOYSTICK_WARRIOR=m
+CONFIG_JOYSTICK_MAGELLAN=m
+CONFIG_JOYSTICK_SPACEORB=m
+CONFIG_JOYSTICK_SPACEBALL=m
+CONFIG_JOYSTICK_STINGER=m
+CONFIG_JOYSTICK_TWIDJOY=m
+# CONFIG_JOYSTICK_DB9 is not set
+# CONFIG_JOYSTICK_GAMECON is not set
+# CONFIG_JOYSTICK_TURBOGRAFX is not set
+CONFIG_JOYSTICK_JOYDUMP=m
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_TOUCHSCREEN_ADS7846=m
+# CONFIG_TOUCHSCREEN_GUNZE is not set
+# CONFIG_TOUCHSCREEN_ELO is not set
+# CONFIG_TOUCHSCREEN_MTOUCH is not set
+# CONFIG_TOUCHSCREEN_MK712 is not set
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=m
+CONFIG_INPUT_UINPUT=m
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+CONFIG_SERIO_SERPORT=m
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+CONFIG_SERIO_RAW=m
+CONFIG_GAMEPORT=m
+# CONFIG_GAMEPORT_NS558 is not set
+# CONFIG_GAMEPORT_L4 is not set
+# CONFIG_GAMEPORT_EMU10K1 is not set
+# CONFIG_GAMEPORT_FM801 is not set
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+# CONFIG_NOZOMI is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=4
+CONFIG_SERIAL_8250_RUNTIME_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_SERIAL_PMACZILOG=y
+CONFIG_SERIAL_PMACZILOG_CONSOLE=y
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=64
+CONFIG_PRINTER=m
+# CONFIG_LP_CONSOLE is not set
+# CONFIG_PPDEV is not set
+# CONFIG_TIPAR is not set
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVC_RTAS=y
+CONFIG_HVCS=m
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_WATCHDOG_RTAS=m
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
+# CONFIG_RTC is not set
+CONFIG_GEN_RTC=y
+# CONFIG_GEN_RTC_X is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_AGP=m
+CONFIG_AGP_UNINORTH=m
+CONFIG_DRM=m
+# CONFIG_DRM_TDFX is not set
+CONFIG_DRM_R128=m
+CONFIG_DRM_RADEON=m
+# CONFIG_DRM_MGA is not set
+# CONFIG_DRM_SIS is not set
+# CONFIG_DRM_VIA is not set
+# CONFIG_DRM_SAVAGE is not set
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=4096
+CONFIG_HANGCHECK_TIMER=m
+
+#
+# TPM devices
+#
+CONFIG_TCG_TPM=m
+CONFIG_TCG_TIS=m
+CONFIG_TCG_ATMEL=m
+# CONFIG_TELCLOCK is not set
+CONFIG_CRASHER=m
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=m
+
+#
+# I2C Algorithms
+#
+CONFIG_I2C_ALGOBIT=y
+# CONFIG_I2C_ALGOPCF is not set
+# CONFIG_I2C_ALGOPCA is not set
+
+#
+# I2C Hardware Bus support
+#
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI1563 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+CONFIG_I2C_AMD8111=m
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_PIIX4 is not set
+CONFIG_I2C_POWERMAC=y
+# CONFIG_I2C_NFORCE2 is not set
+# CONFIG_I2C_PARPORT is not set
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_PROSAVAGE is not set
+# CONFIG_I2C_SAVAGE4 is not set
+# CONFIG_SCx200_ACB is not set
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+# CONFIG_I2C_STUB is not set
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+# CONFIG_I2C_VOODOO3 is not set
+# CONFIG_I2C_PCA_ISA is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_SENSORS_DS1337 is not set
+# CONFIG_SENSORS_DS1374 is not set
+# CONFIG_SENSORS_EEPROM is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_PCF8591 is not set
+# CONFIG_SENSORS_RTC8564 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_RTC_X1205_I2C is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+
+#
+# SPI support
+#
+CONFIG_SPI=y
+CONFIG_SPI_DEBUG=y
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_BITBANG=m
+CONFIG_SPI_BUTTERFLY=m
+
+#
+# SPI Protocol Masters
+#
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Hardware Monitoring support
+#
+# CONFIG_HWMON is not set
+# CONFIG_HWMON_VID is not set
+
+#
+# Misc devices
+#
+# CONFIG_TIFM_CORE is not set
+
+#
+# Multimedia Capabilities Port drivers
+#
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+
+#
+# Video For Linux
+#
+
+#
+# Video Adapters
+#
+# CONFIG_VIDEO_ADV_DEBUG is not set
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_SAA6588=m
+# CONFIG_VIDEO_BWQCAM is not set
+# CONFIG_VIDEO_CQCAM is not set
+# CONFIG_VIDEO_W9966 is not set
+# CONFIG_VIDEO_CPIA is not set
+# CONFIG_VIDEO_SAA5246A is not set
+# CONFIG_VIDEO_SAA5249 is not set
+# CONFIG_TUNER_3036 is not set
+# CONFIG_VIDEO_STRADIS is not set
+# CONFIG_VIDEO_ZORAN is not set
+# CONFIG_VIDEO_SAA7134 is not set
+# CONFIG_VIDEO_MXB is not set
+# CONFIG_VIDEO_DPC is not set
+# CONFIG_VIDEO_HEXIUM_ORION is not set
+# CONFIG_VIDEO_HEXIUM_GEMINI is not set
+# CONFIG_VIDEO_CX88 is not set
+# CONFIG_VIDEO_EM28XX is not set
+# CONFIG_VIDEO_OVCAMCHIP is not set
+# CONFIG_VIDEO_AUDIO_DECODER is not set
+# CONFIG_VIDEO_DECODER is not set
+
+#
+# Radio Adapters
+#
+# CONFIG_RADIO_GEMTEK_PCI is not set
+# CONFIG_RADIO_MAXIRADIO is not set
+# CONFIG_RADIO_MAESTRO is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+CONFIG_VIDEO_TUNER=m
+CONFIG_VIDEO_BUF=m
+CONFIG_VIDEO_BTCX=m
+CONFIG_VIDEO_IR=m
+CONFIG_VIDEO_TVEEPROM=m
+
+#
+# Graphics support
+#
+CONFIG_FB=y
+CONFIG_FB_CFB_FILLRECT=y
+CONFIG_FB_CFB_COPYAREA=y
+CONFIG_FB_CFB_IMAGEBLIT=y
+CONFIG_FB_MACMODES=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+# CONFIG_FB_CIRRUS is not set
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_CONTROL is not set
+# CONFIG_FB_PLATINUM is not set
+# CONFIG_FB_VALKYRIE is not set
+# CONFIG_FB_CT65550 is not set
+# CONFIG_FB_ASILIANT is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_S1D13XXX is not set
+CONFIG_FB_NVIDIA=y
+CONFIG_FB_NVIDIA_I2C=y
+# CONFIG_FB_RIVA is not set
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+CONFIG_FB_MATROX_MULTIHEAD=y
+# CONFIG_FB_RADEON_OLD is not set
+CONFIG_FB_RADEON=y
+CONFIG_FB_RADEON_I2C=y
+# CONFIG_FB_RADEON_DEBUG is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+# CONFIG_FB_SAVAGE is not set
+# CONFIG_FB_SIS is not set
+# CONFIG_FB_NEOMAGIC is not set
+# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set
+# CONFIG_FB_VOODOO1 is not set
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+# CONFIG_VGA_CONSOLE is not set
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+CONFIG_LOGO_LINUX_VGA16=y
+# CONFIG_LOGO_LINUX_CLUT224 is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BACKLIGHT_CLASS_DEVICE=m
+CONFIG_BACKLIGHT_DEVICE=y
+CONFIG_LCD_CLASS_DEVICE=m
+CONFIG_LCD_DEVICE=y
+
+#
+# Bootsplash configuration
+#
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+
+#
+# Advanced Linux Sound Architecture
+#
+CONFIG_SND=m
+CONFIG_SND_TIMER=m
+CONFIG_SND_PCM=m
+CONFIG_SND_HWDEP=m
+CONFIG_SND_RAWMIDI=m
+CONFIG_SND_SEQUENCER=m
+CONFIG_SND_SEQ_DUMMY=m
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_MIXER_OSS=m
+CONFIG_SND_PCM_OSS=m
+CONFIG_SND_PCM_OSS_PLUGINS=y
+CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_DYNAMIC_MINORS=y
+CONFIG_SND_SUPPORT_OLD_API=y
+CONFIG_SND_VERBOSE_PROCFS=y
+CONFIG_SND_VERBOSE_PRINTK=y
+CONFIG_SND_DEBUG=y
+# CONFIG_SND_DEBUG_DETECT is not set
+
+#
+# Generic devices
+#
+CONFIG_SND_MPU401_UART=m
+CONFIG_SND_DUMMY=m
+CONFIG_SND_VIRMIDI=m
+CONFIG_SND_MTPAV=m
+CONFIG_SND_SERIAL_U16550=m
+CONFIG_SND_MPU401=m
+
+#
+# PCI devices
+#
+# CONFIG_SND_AD1889 is not set
+# CONFIG_SND_ALS4000 is not set
+# CONFIG_SND_ALI5451 is not set
+# CONFIG_SND_ATIIXP is not set
+# CONFIG_SND_ATIIXP_MODEM is not set
+# CONFIG_SND_AU8810 is not set
+# CONFIG_SND_AU8820 is not set
+# CONFIG_SND_AU8830 is not set
+# CONFIG_SND_AZT3328 is not set
+# CONFIG_SND_BT87X is not set
+# CONFIG_SND_CA0106 is not set
+# CONFIG_SND_CMIPCI is not set
+# CONFIG_SND_CS4281 is not set
+# CONFIG_SND_CS46XX is not set
+CONFIG_SND_DARLA20=m
+CONFIG_SND_GINA20=m
+CONFIG_SND_LAYLA20=m
+CONFIG_SND_DARLA24=m
+CONFIG_SND_GINA24=m
+CONFIG_SND_LAYLA24=m
+CONFIG_SND_MONA=m
+CONFIG_SND_MIA=m
+CONFIG_SND_ECHO3G=m
+CONFIG_SND_INDIGO=m
+CONFIG_SND_INDIGOIO=m
+CONFIG_SND_INDIGODJ=m
+# CONFIG_SND_EMU10K1 is not set
+# CONFIG_SND_EMU10K1X is not set
+# CONFIG_SND_ENS1370 is not set
+# CONFIG_SND_ENS1371 is not set
+# CONFIG_SND_ES1938 is not set
+# CONFIG_SND_ES1968 is not set
+# CONFIG_SND_FM801 is not set
+# CONFIG_SND_HDA_INTEL is not set
+# CONFIG_SND_HDSP is not set
+# CONFIG_SND_HDSPM is not set
+# CONFIG_SND_ICE1712 is not set
+# CONFIG_SND_ICE1724 is not set
+# CONFIG_SND_INTEL8X0 is not set
+# CONFIG_SND_INTEL8X0M is not set
+# CONFIG_SND_KORG1212 is not set
+# CONFIG_SND_MAESTRO3 is not set
+# CONFIG_SND_MIXART is not set
+# CONFIG_SND_NM256 is not set
+# CONFIG_SND_PCXHR is not set
+# CONFIG_SND_RME32 is not set
+# CONFIG_SND_RME96 is not set
+# CONFIG_SND_RME9652 is not set
+# CONFIG_SND_SONICVIBES is not set
+# CONFIG_SND_TRIDENT is not set
+# CONFIG_SND_VIA82XX is not set
+# CONFIG_SND_VIA82XX_MODEM is not set
+# CONFIG_SND_VX222 is not set
+# CONFIG_SND_YMFPCI is not set
+
+#
+# ALSA PowerMac devices
+#
+CONFIG_SND_POWERMAC=m
+CONFIG_SND_POWERMAC_AUTO_DRC=y
+
+#
+# USB devices
+#
+CONFIG_SND_USB_AUDIO=m
+CONFIG_SND_USB_USX2Y=m
+
+#
+# Open Sound System
+#
+# CONFIG_SOUND_PRIME is not set
+
+#
+# USB support
+#
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+CONFIG_USB_BANDWIDTH=y
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_OTG is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+# CONFIG_USB_ISP116X_HCD is not set
+CONFIG_USB_OHCI_HCD=y
+# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_UHCI_HCD=m
+# CONFIG_USB_SL811_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_OBSOLETE_OSS_USB_DRIVER is not set
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# may also be needed; see USB_STORAGE Help for more information
+#
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_USBAT=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_STORAGE_ALAUDA=y
+CONFIG_USB_STORAGE_ONETOUCH=y
+# CONFIG_USB_LIBUSUAL is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+CONFIG_USB_HIDINPUT=y
+# CONFIG_USB_HIDINPUT_POWERBOOK is not set
+CONFIG_HID_FF=y
+CONFIG_HID_PID=y
+CONFIG_LOGITECH_FF=y
+CONFIG_THRUSTMASTER_FF=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_ACECAD=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_MTOUCH=m
+CONFIG_USB_ITMTOUCH=m
+CONFIG_USB_EGALAX=m
+CONFIG_USB_YEALINK=m
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+CONFIG_USB_ATI_REMOTE2=m
+CONFIG_USB_KEYSPAN_REMOTE=m
+CONFIG_USB_APPLETOUCH=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+
+#
+# USB Multimedia devices
+#
+CONFIG_USB_DABUSB=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_DSBR=m
+CONFIG_USB_ET61X251=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_KONICAWC=m
+CONFIG_USB_OV511=m
+CONFIG_USB_SE401=m
+CONFIG_USB_SN9C102=m
+CONFIG_USB_STV680=m
+CONFIG_USB_PWC=m
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_NET_AX8817X=m
+CONFIG_USB_NET_CDCETHER=m
+CONFIG_USB_NET_GL620A=m
+CONFIG_USB_NET_NET1080=m
+CONFIG_USB_NET_PLUSB=m
+CONFIG_USB_NET_RNDIS_HOST=m
+CONFIG_USB_NET_CDC_SUBSET=m
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_NET_ZAURUS=m
+# CONFIG_USB_MON is not set
+
+#
+# USB port drivers
+#
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_AIRPRIME=m
+CONFIG_USB_SERIAL_ANYDATA=m
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_CP2101=m
+CONFIG_USB_SERIAL_CYPRESS_M8=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_FUNSOFT=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_GARMIN=m
+CONFIG_USB_SERIAL_IPW=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_NAVMAN=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_HP4X=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_SIERRAWIRELESS=m
+CONFIG_USB_SERIAL_TI=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_LEGOTOWER=m
+CONFIG_USB_LCD=m
+CONFIG_USB_BERRY_CHARGE=m
+CONFIG_USB_LED=m
+CONFIG_USB_CYTHERM=m
+CONFIG_USB_PHIDGETKIT=m
+CONFIG_USB_PHIDGETSERVO=m
+CONFIG_USB_IDMOUSE=m
+CONFIG_USB_SISUSBVGA=m
+CONFIG_USB_SISUSBVGA_CON=y
+CONFIG_USB_LD=m
+# CONFIG_USB_TEST is not set
+
+#
+# USB DSL modem support
+#
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+# CONFIG_MMC is not set
+
+#
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+
+#
+# DMA Engine support
+#
+# CONFIG_DMA_ENGINE is not set
+
+#
+# DMA Clients
+#
+
+#
+# DMA Devices
+#
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+# CONFIG_EXT2_FS_XIP is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
+# CONFIG_JFS_DEBUG is not set
+CONFIG_JFS_STATISTICS=y
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_QUOTA=m
+CONFIG_XFS_DMAPI=m
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_XFS_RT=y
+# CONFIG_XFS_DEBUG is not set
+# CONFIG_XFS_TRACE is not set
+CONFIG_OCFS2_FS=m
+CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m
+CONFIG_MINIX_FS=m
+CONFIG_ROMFS_FS=m
+CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
+CONFIG_DMAPI=m
+# CONFIG_DMAPI_DEBUG is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
+CONFIG_QFMT_V2=m
+CONFIG_QUOTACTL=y
+CONFIG_DNOTIFY=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_FUSE_FS=m
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=m
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=m
+CONFIG_UDF_FS=m
+CONFIG_UDF_NLS=y
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+CONFIG_NTFS_FS=m
+# CONFIG_NTFS_DEBUG is not set
+# CONFIG_NTFS_RW is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=m
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+CONFIG_UFS_FS=m
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V2_ACL=y
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_V3_ACL=y
+CONFIG_NFSD_V4=y
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_NFS_ACL_SUPPORT=m
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=m
+CONFIG_SUNRPC_GSS=m
+CONFIG_RPCSEC_GSS_KRB5=m
+CONFIG_RPCSEC_GSS_SPKM3=m
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+# CONFIG_CIFS_STATS2 is not set
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+# CONFIG_CIFS_EXPERIMENTAL is not set
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+CONFIG_9P_FS=m
+CONFIG_GENERIC_ACL=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_ATARI_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+CONFIG_SGI_PARTITION=y
+CONFIG_ULTRIX_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ASCII=m
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Library routines
+#
+CONFIG_CRC_CCITT=m
+CONFIG_CRC16=m
+CONFIG_CRC32=y
+CONFIG_LIBCRC32C=m
+CONFIG_ZLIB_INFLATE=m
+CONFIG_ZLIB_DEFLATE=m
+CONFIG_TEXTSEARCH=y
+CONFIG_TEXTSEARCH_KMP=m
+CONFIG_TEXTSEARCH_BM=m
+CONFIG_TEXTSEARCH_FSM=m
+
+#
+# Instrumentation Support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_KPROBES=y
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_LOG_BUF_SHIFT=19
+# CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_KOBJECT is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
+CONFIG_FORCED_INLINING=y
+# CONFIG_RCU_TORTURE_TEST is not set
+# CONFIG_LKCD_DUMP is not set
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_HCALL_STATS=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+# CONFIG_XMON_DEFAULT is not set
+CONFIG_IRQSTACKS=y
+CONFIG_BOOTX_TEXT=y
+# CONFIG_PPC_EARLY_DEBUG_LPAR is not set
+# CONFIG_PPC_EARLY_DEBUG_G5 is not set
+# CONFIG_PPC_EARLY_DEBUG_RTAS is not set
+# CONFIG_PPC_EARLY_DEBUG_MAPLE is not set
+# CONFIG_PPC_EARLY_DEBUG_ISERIES is not set
+
+#
+# Security options
+#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+# CONFIG_SECURITY_NETWORK_XFRM is not set
+CONFIG_SECURITY_CAPABILITIES=m
+CONFIG_SECURITY_ROOTPLUG=m
+CONFIG_SECURITY_SECLVL=m
+# CONFIG_SECURITY_SELINUX is not set
+CONFIG_SECURITY_APPARMOR=m
+CONFIG_KEYS_COMPAT=y
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ABLKCIPHER=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_CRYPTD=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_FCRYPT=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_KHAZAD=m
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CAMELLIA=m
+CONFIG_CRYPTO_TEST=m
+
+#
+# Hardware crypto devices
+#
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config

index 18b56e4..eb87a50 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config
@@ -1,7 +1,7 @@
  #
  # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16.46
-# Tue Jul  3 18:35:09 2007
+# Linux kernel version: 2.6.16.60
+# Wed May 21 20:28:50 2008
  #
  CONFIG_X86_64=y
  CONFIG_64BIT=y
@@ -32,15 +32,15 @@ CONFIG_LOCALVERSION=""
  CONFIG_LOCALVERSION_AUTO=y
  CONFIG_SUSE_KERNEL=y
  CONFIG_SLE_VERSION=10
-CONFIG_SLE_SP=1
+CONFIG_SLE_SP=2
  CONFIG_SLE_SP_SUBLEVEL=0
  CONFIG_SWAP=y
  CONFIG_SYSVIPC=y
  CONFIG_POSIX_MQUEUE=y
  CONFIG_BSD_PROCESS_ACCT=y
  CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASK_XACCT=y
  CONFIG_SYSCTL=y
  CONFIG_AUDIT=y
@@ -198,6 +198,7 @@ CONFIG_ACPI_BUTTON=m
  CONFIG_ACPI_VIDEO=m
  # CONFIG_ACPI_HOTKEY is not set
  CONFIG_ACPI_FAN=m
+CONFIG_ACPI_DOCK=m
  CONFIG_ACPI_PROCESSOR=m
  CONFIG_ACPI_HOTPLUG_CPU=y
  CONFIG_ACPI_THERMAL=m
@@ -259,6 +260,7 @@ CONFIG_PCI_MMCONFIG=y
  CONFIG_PCIEPORTBUS=y
  CONFIG_HOTPLUG_PCI_PCIE=m
  # CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set
+CONFIG_PCIEAER=y
  CONFIG_PCI_MSI=y
  # CONFIG_PCI_LEGACY_PROC is not set
  # CONFIG_PCI_DEBUG is not set
@@ -1096,6 +1098,7 @@ CONFIG_SCSI_FC_ATTRS=m
  # CONFIG_SCSI_ISCSI_ATTRS is not set
  CONFIG_SCSI_SAS_ATTRS=m
  CONFIG_SCSI_SAS_LIBSAS=m
+# CONFIG_SCSI_SAS_ATA is not set
  # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
  CONFIG_ISCSI_TARGET=m
  
@@ -1249,11 +1252,8 @@ CONFIG_DM_MULTIPATH=m
  CONFIG_DM_MULTIPATH_EMC=m
  CONFIG_DM_MULTIPATH_HP_SW=m
  CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_ALUA=m
  CONFIG_DM_NL_EVT=y
-
-#
-# Fusion MPT device support
-#
  CONFIG_FUSION=y
  CONFIG_FUSION_SPI=m
  CONFIG_FUSION_FC=m
@@ -1262,6 +1262,7 @@ CONFIG_FUSION_MAX_SGE=128
  CONFIG_FUSION_MAX_FC_SGE=256
  CONFIG_FUSION_CTL=m
  CONFIG_FUSION_LAN=m
+# CONFIG_FUSION_LOGGING is not set
  
  #
  # IEEE 1394 (FireWire) support
@@ -1409,6 +1410,7 @@ CONFIG_DL2K=m
  CONFIG_E1000=m
  CONFIG_E1000_NAPI=y
  # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_IGB=m
  CONFIG_NS83820=m
  CONFIG_HAMACHI=m
  CONFIG_YELLOWFIN=m
@@ -1422,17 +1424,22 @@ CONFIG_SK98LIN=m
  CONFIG_VIA_VELOCITY=m
  CONFIG_TIGON3=m
  CONFIG_BNX2=m
+CONFIG_BNX2X=m
  CONFIG_QLA3XXX=m
  
  #
  # Ethernet (10000 Mbit)
  #
  CONFIG_CHELSIO_T1=m
+# CONFIG_CHELSIO_T3 is not set
+CONFIG_IXGBE=m
+# CONFIG_IXGBE_NAPI is not set
  CONFIG_IXGB=m
  CONFIG_IXGB_NAPI=y
  CONFIG_S2IO=m
  CONFIG_S2IO_NAPI=y
  CONFIG_NETXEN_NIC=m
+CONFIG_MYRI10GE=m
  
  #
  # Token Ring devices
@@ -1540,7 +1547,6 @@ CONFIG_NET_FC=y
  CONFIG_SHAPER=m
  CONFIG_NETCONSOLE=m
  CONFIG_NETPOLL=y
-CONFIG_NETPOLL_RX=y
  CONFIG_NETPOLL_TRAP=y
  CONFIG_NET_POLL_CONTROLLER=y
  
@@ -2044,6 +2050,8 @@ CONFIG_SENSORS_HDAPS=m
  # Misc devices
  #
  CONFIG_IBM_ASM=m
+CONFIG_TIFM_CORE=m
+CONFIG_TIFM_7XX1=m
  
  #
  # Multimedia Capabilities Port drivers
@@ -2690,8 +2698,22 @@ CONFIG_USB_LD=m
  #
  CONFIG_MMC=m
  # CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+
+#
+# MMC/SD Card Drivers
+#
  CONFIG_MMC_BLOCK=m
+CONFIG_MMC_BLOCK_BOUNCE=y
+CONFIG_SDIO_UART=m
+
+#
+# MMC/SD Host Controller Drivers
+#
+CONFIG_MMC_SDHCI=m
+CONFIG_MMC_RICOH_MMC=m
  CONFIG_MMC_WBSD=m
+CONFIG_MMC_TIFM_SD=m
  
  #
  # InfiniBand support
@@ -2976,7 +2998,7 @@ CONFIG_LOG_BUF_SHIFT=18
  # CONFIG_DEBUG_SPINLOCK is not set
  # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
  # CONFIG_DEBUG_KOBJECT is not set
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_DEBUG_FS=y
  # CONFIG_DEBUG_VM is not set
  # CONFIG_FRAME_POINTER is not set
@@ -3004,7 +3026,13 @@ CONFIG_SECURITY_APPARMOR=m
  # Cryptographic options
  #
  CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ABLKCIPHER=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_HASH=m
+CONFIG_CRYPTO_MANAGER=m
  CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_XCBC=m
  CONFIG_CRYPTO_NULL=m
  CONFIG_CRYPTO_MD4=m
  CONFIG_CRYPTO_MD5=y
@@ -3013,9 +3041,18 @@ CONFIG_CRYPTO_SHA256=m
  CONFIG_CRYPTO_SHA512=m
  CONFIG_CRYPTO_WP512=m
  CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_CRYPTD=m
  CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_FCRYPT=m
  CONFIG_CRYPTO_BLOWFISH=m
  CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+CONFIG_CRYPTO_TWOFISH_X86_64=m
  CONFIG_CRYPTO_SERPENT=m
  CONFIG_CRYPTO_AES=m
  CONFIG_CRYPTO_AES_X86_64=m
@@ -3028,6 +3065,7 @@ CONFIG_CRYPTO_ANUBIS=m
  CONFIG_CRYPTO_DEFLATE=m
  CONFIG_CRYPTO_MICHAEL_MIC=m
  CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CAMELLIA=m
  CONFIG_CRYPTO_TEST=m
  
  #
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config

index 8037b01..ec9a18c 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config
@@ -1,7 +1,7 @@
  #
  # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16.46
-# Tue Jul  3 18:30:50 2007
+# Linux kernel version: 2.6.16.60
+# Wed May 21 20:29:32 2008
  #
  CONFIG_X86_64=y
  CONFIG_64BIT=y
@@ -32,15 +32,15 @@ CONFIG_LOCALVERSION=""
  CONFIG_LOCALVERSION_AUTO=y
  CONFIG_SUSE_KERNEL=y
  CONFIG_SLE_VERSION=10
-CONFIG_SLE_SP=1
+CONFIG_SLE_SP=2
  CONFIG_SLE_SP_SUBLEVEL=0
  CONFIG_SWAP=y
  CONFIG_SYSVIPC=y
  CONFIG_POSIX_MQUEUE=y
  CONFIG_BSD_PROCESS_ACCT=y
  CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
  CONFIG_TASK_XACCT=y
  CONFIG_SYSCTL=y
  CONFIG_AUDIT=y
@@ -180,6 +180,7 @@ CONFIG_ACPI_BUTTON=m
  CONFIG_ACPI_VIDEO=m
  # CONFIG_ACPI_HOTKEY is not set
  CONFIG_ACPI_FAN=m
+CONFIG_ACPI_DOCK=m
  CONFIG_ACPI_PROCESSOR=m
  CONFIG_ACPI_THERMAL=m
  CONFIG_ACPI_ASUS=m
@@ -239,6 +240,7 @@ CONFIG_PCI_MMCONFIG=y
  CONFIG_PCIEPORTBUS=y
  CONFIG_HOTPLUG_PCI_PCIE=m
  # CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set
+CONFIG_PCIEAER=y
  CONFIG_PCI_MSI=y
  # CONFIG_PCI_LEGACY_PROC is not set
  # CONFIG_PCI_DEBUG is not set
@@ -1077,6 +1079,7 @@ CONFIG_SCSI_FC_ATTRS=m
  # CONFIG_SCSI_ISCSI_ATTRS is not set
  CONFIG_SCSI_SAS_ATTRS=m
  CONFIG_SCSI_SAS_LIBSAS=m
+# CONFIG_SCSI_SAS_ATA is not set
  # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
  CONFIG_ISCSI_TARGET=m
  
@@ -1230,11 +1233,8 @@ CONFIG_DM_MULTIPATH=m
  CONFIG_DM_MULTIPATH_EMC=m
  CONFIG_DM_MULTIPATH_HP_SW=m
  CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_ALUA=m
  CONFIG_DM_NL_EVT=y
-
-#
-# Fusion MPT device support
-#
  CONFIG_FUSION=y
  CONFIG_FUSION_SPI=m
  CONFIG_FUSION_FC=m
@@ -1243,6 +1243,7 @@ CONFIG_FUSION_MAX_SGE=128
  CONFIG_FUSION_MAX_FC_SGE=256
  CONFIG_FUSION_CTL=m
  CONFIG_FUSION_LAN=m
+# CONFIG_FUSION_LOGGING is not set
  
  #
  # IEEE 1394 (FireWire) support
@@ -1391,6 +1392,7 @@ CONFIG_DL2K=m
  CONFIG_E1000=m
  CONFIG_E1000_NAPI=y
  # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_IGB=m
  CONFIG_NS83820=m
  CONFIG_HAMACHI=m
  CONFIG_YELLOWFIN=m
@@ -1404,17 +1406,22 @@ CONFIG_SK98LIN=m
  CONFIG_VIA_VELOCITY=m
  CONFIG_TIGON3=m
  CONFIG_BNX2=m
+CONFIG_BNX2X=m
  CONFIG_QLA3XXX=m
  
  #
  # Ethernet (10000 Mbit)
  #
  CONFIG_CHELSIO_T1=m
+# CONFIG_CHELSIO_T3 is not set
+CONFIG_IXGBE=m
+# CONFIG_IXGBE_NAPI is not set
  CONFIG_IXGB=m
  CONFIG_IXGB_NAPI=y
  CONFIG_S2IO=m
  CONFIG_S2IO_NAPI=y
  CONFIG_NETXEN_NIC=m
+CONFIG_MYRI10GE=m
  
  #
  # Token Ring devices
@@ -1522,7 +1529,6 @@ CONFIG_NET_FC=y
  CONFIG_SHAPER=m
  CONFIG_NETCONSOLE=m
  CONFIG_NETPOLL=y
-CONFIG_NETPOLL_RX=y
  CONFIG_NETPOLL_TRAP=y
  CONFIG_NET_POLL_CONTROLLER=y
  
@@ -2032,6 +2038,8 @@ CONFIG_SENSORS_HDAPS=m
  # Misc devices
  #
  CONFIG_IBM_ASM=m
+CONFIG_TIFM_CORE=m
+CONFIG_TIFM_7XX1=m
  
  #
  # Multimedia Capabilities Port drivers
@@ -2678,8 +2686,22 @@ CONFIG_USB_LD=m
  #
  CONFIG_MMC=m
  # CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+
+#
+# MMC/SD Card Drivers
+#
  CONFIG_MMC_BLOCK=m
+CONFIG_MMC_BLOCK_BOUNCE=y
+CONFIG_SDIO_UART=m
+
+#
+# MMC/SD Host Controller Drivers
+#
+CONFIG_MMC_SDHCI=m
+CONFIG_MMC_RICOH_MMC=m
  CONFIG_MMC_WBSD=m
+CONFIG_MMC_TIFM_SD=m
  
  #
  # InfiniBand support
@@ -2964,7 +2986,7 @@ CONFIG_LOG_BUF_SHIFT=18
  # CONFIG_DEBUG_SPINLOCK is not set
  # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
  # CONFIG_DEBUG_KOBJECT is not set
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_DEBUG_FS=y
  # CONFIG_DEBUG_VM is not set
  # CONFIG_FRAME_POINTER is not set
@@ -2992,7 +3014,13 @@ CONFIG_SECURITY_APPARMOR=m
  # Cryptographic options
  #
  CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=m
+CONFIG_CRYPTO_ABLKCIPHER=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_HASH=m
+CONFIG_CRYPTO_MANAGER=m
  CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_XCBC=m
  CONFIG_CRYPTO_NULL=m
  CONFIG_CRYPTO_MD4=m
  CONFIG_CRYPTO_MD5=y
@@ -3001,9 +3029,18 @@ CONFIG_CRYPTO_SHA256=m
  CONFIG_CRYPTO_SHA512=m
  CONFIG_CRYPTO_WP512=m
  CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_CRYPTD=m
  CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_FCRYPT=m
  CONFIG_CRYPTO_BLOWFISH=m
  CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+CONFIG_CRYPTO_TWOFISH_X86_64=m
  CONFIG_CRYPTO_SERPENT=m
  CONFIG_CRYPTO_AES=m
  CONFIG_CRYPTO_AES_X86_64=m
@@ -3016,6 +3053,7 @@ CONFIG_CRYPTO_ANUBIS=m
  CONFIG_CRYPTO_DEFLATE=m
  CONFIG_CRYPTO_MICHAEL_MIC=m
  CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CAMELLIA=m
  CONFIG_CRYPTO_TEST=m
  
  #
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ppc64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ppc64-smp.config

new file mode 100644 (file)

index 0000000..d8a493c
--- /dev/null
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ppc64-smp.config
@@ -0,0 +1,3104 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.18-prep
+# Sat Jul 12 00:22:15 2008
+#
+CONFIG_PPC64=y
+CONFIG_64BIT=y
+CONFIG_PPC_MERGE=y
+CONFIG_MMU=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_IRQ_PER_CPU=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_PPC=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_SYSVIPC_COMPAT=y
+CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
+CONFIG_ARCH_MAY_HAVE_PC_FDC=y
+CONFIG_PPC_OF=y
+CONFIG_PPC_UDBG_16550=y
+CONFIG_GENERIC_TBSYNC=y
+CONFIG_AUDIT_ARCH=y
+# CONFIG_DEFAULT_UIMAGE is not set
+
+#
+# Processor support
+#
+# CONFIG_POWER4_ONLY is not set
+CONFIG_POWER3=y
+CONFIG_POWER4=y
+CONFIG_PPC_FPU=y
+# CONFIG_PPC_DCR_NATIVE is not set
+CONFIG_PPC_DCR_MMIO=y
+CONFIG_PPC_DCR=y
+CONFIG_ALTIVEC=y
+CONFIG_PPC_STD_MMU=y
+CONFIG_VIRT_CPU_ACCOUNTING=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=128
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+# CONFIG_BSD_PROCESS_ACCT_V3 is not set
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_AUDIT=y
+CONFIG_AUDITSYSCALL=y
+# CONFIG_IKCONFIG is not set
+CONFIG_CPUSETS=y
+CONFIG_RELAY=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_EMBEDDED is not set
+CONFIG_SYSCTL=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_ALL is not set
+CONFIG_KALLSYMS_EXTRA_PASS=y
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SHMEM=y
+CONFIG_SLAB=y
+CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_RT_MUTEXES=y
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0
+# CONFIG_SLOB is not set
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_MODVERSIONS=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_MODULE_SIG=y
+# CONFIG_MODULE_SIG_FORCE is not set
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+
+#
+# Process debugging support
+#
+CONFIG_PTRACE=y
+CONFIG_UTRACE=y
+
+#
+# Block layer
+#
+CONFIG_BLK_DEV_IO_TRACE=y
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+CONFIG_DEFAULT_DEADLINE=y
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="deadline"
+
+#
+# Platform support
+#
+CONFIG_PPC_MULTIPLATFORM=y
+# CONFIG_PPC_ISERIES is not set
+# CONFIG_EMBEDDED6xx is not set
+# CONFIG_APUS is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC_PMAC=y
+CONFIG_PPC_PMAC64=y
+# CONFIG_PPC_MAPLE is not set
+CONFIG_PPC_CELL=y
+CONFIG_PPC_CELL_NATIVE=y
+CONFIG_PPC_IBM_CELL_BLADE=y
+CONFIG_UDBG_RTAS_CONSOLE=y
+CONFIG_XICS=y
+CONFIG_U3_DART=y
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_ERROR_LOGGING=y
+CONFIG_RTAS_PROC=y
+CONFIG_RTAS_FLASH=y
+CONFIG_MMIO_NVRAM=y
+CONFIG_PPC_PMI=m
+CONFIG_MPIC_BROKEN_U3=y
+CONFIG_IBMVIO=y
+CONFIG_IBMEBUS=y
+# CONFIG_PPC_MPC106 is not set
+CONFIG_PPC_970_NAP=y
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+CONFIG_CPU_FREQ_DEBUG=y
+CONFIG_CPU_FREQ_STAT=m
+CONFIG_CPU_FREQ_STAT_DETAILS=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=m
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=m
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
+CONFIG_CPU_FREQ_PMAC64=y
+CONFIG_AXON_RAM=m
+# CONFIG_WANT_EARLY_SERIAL is not set
+CONFIG_MPIC=y
+
+#
+# Cell Broadband Engine options
+#
+CONFIG_SPU_FS=m
+CONFIG_SPU_BASE=y
+CONFIG_SPUFS_MMAP=y
+CONFIG_CBE_RAS=y
+CONFIG_CBE_THERM=m
+CONFIG_CBE_CPUFREQ=m
+CONFIG_CBE_CPUFREQ_PMI=m
+CONFIG_CBE_AXON_UTL=y
+CONFIG_CBE_AXON_PCI=y
+
+#
+# Kernel options
+#
+# CONFIG_HZ_100 is not set
+# CONFIG_HZ_250 is not set
+CONFIG_HZ_1000=y
+CONFIG_HZ=1000
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_PREEMPT_BKL=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=y
+CONFIG_FORCE_MAX_ZONEORDER=9
+CONFIG_IOMMU_VMERGE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_PPC_SPLPAR=y
+CONFIG_EEH=y
+CONFIG_SCANLOG=y
+CONFIG_LPARCFG=y
+CONFIG_NUMA=y
+CONFIG_NODES_SHIFT=4
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_NEED_MULTIPLE_NODES=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPARSEMEM_EXTREME=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTPLUG_SPARSE=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+CONFIG_RESOURCES_64BIT=y
+CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
+CONFIG_ARCH_MEMORY_PROBE=y
+CONFIG_NODES_SPAN_OTHER_NODES=y
+CONFIG_PPC_64K_PAGES=y
+CONFIG_SCHED_SMT=y
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+CONFIG_PM=y
+CONFIG_PM_LEGACY=y
+# CONFIG_PM_DEBUG is not set
+# CONFIG_SECCOMP is not set
+CONFIG_ISA_DMA_API=y
+
+#
+# Bus options
+#
+CONFIG_GENERIC_ISA_DMA=y
+# CONFIG_MPIC_WEIRD is not set
+CONFIG_PPC_I8259=y
+# CONFIG_PPC_INDIRECT_PCI is not set
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+CONFIG_PCI_MSI=y
+# CONFIG_PCI_DEBUG is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+CONFIG_PCCARD=y
+# CONFIG_PCMCIA_DEBUG is not set
+CONFIG_PCMCIA=y
+CONFIG_PCMCIA_LOAD_CIS=y
+CONFIG_PCMCIA_IOCTL=y
+CONFIG_CARDBUS=y
+
+#
+# PC-card bridges
+#
+CONFIG_YENTA=y
+CONFIG_YENTA_O2=y
+CONFIG_YENTA_RICOH=y
+CONFIG_YENTA_TI=y
+CONFIG_YENTA_ENE_TUNE=y
+CONFIG_YENTA_TOSHIBA=y
+CONFIG_PD6729=m
+CONFIG_I82092=m
+CONFIG_PCCARD_NONSTATIC=y
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+CONFIG_HOTPLUG_PCI_SHPC=m
+CONFIG_HOTPLUG_PCI_SHPC_POLL_EVENT_MODE=y
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_KERNEL_START=0xc000000000000000
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+# CONFIG_NETDEBUG is not set
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_UNIX=y
+CONFIG_XFRM=y
+CONFIG_XFRM_NALGO=m
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=m
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_ASK_IP_FIB_HASH=y
+# CONFIG_IP_FIB_TRIE is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+# CONFIG_IP_ROUTE_MULTIPATH_CACHED is not set
+CONFIG_IP_ROUTE_VERBOSE=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+CONFIG_INET_XFRM_TUNNEL=m
+CONFIG_INET_TUNNEL=m
+CONFIG_INET_XFRM_MODE_TRANSPORT=m
+CONFIG_INET_XFRM_MODE_TUNNEL=m
+CONFIG_INET_DIAG=m
+CONFIG_INET_TCP_DIAG=m
+CONFIG_TCP_CONG_ADVANCED=y
+
+#
+# TCP congestion control
+#
+CONFIG_TCP_CONG_BIC=y
+CONFIG_TCP_CONG_CUBIC=m
+CONFIG_TCP_CONG_WESTWOOD=m
+CONFIG_TCP_CONG_HTCP=m
+CONFIG_TCP_CONG_HSTCP=m
+CONFIG_TCP_CONG_HYBLA=m
+CONFIG_TCP_CONG_VEGAS=m
+CONFIG_TCP_CONG_SCALABLE=m
+CONFIG_TCP_CONG_LP=m
+CONFIG_TCP_CONG_VENO=m
+
+#
+# IP: Virtual Server Configuration
+#
+CONFIG_IP_VS=m
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=12
+
+#
+# IPVS transport protocol load balancing support
+#
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_PROTO_ESP=y
+CONFIG_IP_VS_PROTO_AH=y
+
+#
+# IPVS scheduler
+#
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+CONFIG_IP_VS_SED=m
+CONFIG_IP_VS_NQ=m
+
+#
+# IPVS application helper
+#
+CONFIG_IP_VS_FTP=m
+CONFIG_IPV6=m
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_INET6_XFRM_TUNNEL=m
+CONFIG_INET6_TUNNEL=m
+CONFIG_INET6_XFRM_MODE_TRANSPORT=m
+CONFIG_INET6_XFRM_MODE_TUNNEL=m
+CONFIG_IPV6_TUNNEL=m
+# CONFIG_IPV6_SUBTREES is not set
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_ROUTE_FWMARK=y
+CONFIG_NETWORK_SECMARK=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# Core Netfilter Configuration
+#
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_NETLINK_LOG=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
+CONFIG_NETFILTER_XT_TARGET_SECMARK=m
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
+CONFIG_NETFILTER_XT_MATCH_ESP=m
+CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_LIMIT=m
+CONFIG_NETFILTER_XT_MATCH_MAC=m
+CONFIG_NETFILTER_XT_MATCH_MARK=m
+CONFIG_NETFILTER_XT_MATCH_POLICY=m
+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
+CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
+CONFIG_NETFILTER_XT_MATCH_QUOTA=m
+CONFIG_NETFILTER_XT_MATCH_REALM=m
+CONFIG_NETFILTER_XT_MATCH_SCTP=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_CT_ACCT=y
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_CONNTRACK_SECMARK=y
+CONFIG_IP_NF_CONNTRACK_EVENTS=y
+CONFIG_IP_NF_CONNTRACK_NETLINK=m
+CONFIG_IP_NF_CT_PROTO_SCTP=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_NETBIOS_NS=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_PPTP=m
+CONFIG_IP_NF_H323=m
+CONFIG_IP_NF_SIP=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_ADDRTYPE=m
+CONFIG_IP_NF_MATCH_HASHLIMIT=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_NAT_PPTP=m
+CONFIG_IP_NF_NAT_H323=m
+CONFIG_IP_NF_NAT_SIP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+
+#
+# IPv6: Netfilter Configuration (EXPERIMENTAL)
+#
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_HL=m
+CONFIG_IP6_NF_RAW=m
+
+#
+# DECnet: Netfilter Configuration
+#
+# CONFIG_DECNET_NF_GRABULATOR is not set
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_SNAT=m
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_BRIDGE_EBT_ULOG=m
+
+#
+# DCCP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP=m
+CONFIG_INET_DCCP_DIAG=m
+CONFIG_IP_DCCP_ACKVEC=y
+
+#
+# DCCP CCIDs Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP_CCID2=m
+CONFIG_IP_DCCP_CCID3=m
+CONFIG_IP_DCCP_TFRC_LIB=m
+
+#
+# DCCP Kernel Hacking
+#
+# CONFIG_IP_DCCP_DEBUG is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+# CONFIG_SCTP_HMAC_NONE is not set
+# CONFIG_SCTP_HMAC_SHA1 is not set
+CONFIG_SCTP_HMAC_MD5=y
+
+#
+# TIPC Configuration (EXPERIMENTAL)
+#
+CONFIG_TIPC=m
+# CONFIG_TIPC_ADVANCED is not set
+# CONFIG_TIPC_DEBUG is not set
+CONFIG_ATM=m
+CONFIG_ATM_CLIP=m
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=m
+# CONFIG_ATM_MPOA is not set
+CONFIG_ATM_BR2684=m
+# CONFIG_ATM_BR2684_IPFILTER is not set
+CONFIG_BRIDGE=m
+CONFIG_VLAN_8021Q=m
+CONFIG_DECNET=m
+CONFIG_DECNET_ROUTER=y
+CONFIG_DECNET_ROUTE_FWMARK=y
+CONFIG_LLC=y
+# CONFIG_LLC2 is not set
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=m
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_ECONET is not set
+CONFIG_WAN_ROUTER=m
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+# CONFIG_NET_SCH_CLK_JIFFIES is not set
+CONFIG_NET_SCH_CLK_GETTIMEOFDAY=y
+# CONFIG_NET_SCH_CLK_CPU is not set
+
+#
+# Queueing/Scheduling
+#
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_ATM=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_INGRESS=m
+
+#
+# Classification
+#
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_BASIC=m
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_CLS_U32_PERF=y
+CONFIG_CLS_U32_MARK=y
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_STACK=32
+CONFIG_NET_EMATCH_CMP=m
+CONFIG_NET_EMATCH_NBYTE=m
+CONFIG_NET_EMATCH_U32=m
+CONFIG_NET_EMATCH_META=m
+CONFIG_NET_EMATCH_TEXT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_GACT_PROB=y
+CONFIG_NET_ACT_MIRRED=m
+CONFIG_NET_ACT_IPT=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_ACT_SIMP=m
+CONFIG_NET_CLS_IND=y
+CONFIG_NET_ESTIMATOR=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+# CONFIG_NET_TCPPROBE is not set
+# CONFIG_HAMRADIO is not set
+CONFIG_IRDA=m
+
+#
+# IrDA protocols
+#
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+# CONFIG_IRDA_ULTRA is not set
+
+#
+# IrDA options
+#
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+CONFIG_IRDA_FAST_RR=y
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+
+#
+# SIR device drivers
+#
+CONFIG_IRTTY_SIR=m
+
+#
+# Dongle support
+#
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_TOIM3232_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_MA600_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_MCP2120_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_ACT200L_DONGLE=m
+
+#
+# Old SIR device drivers
+#
+
+#
+# Old Serial dongle support
+#
+
+#
+# FIR device drivers
+#
+CONFIG_USB_IRDA=m
+CONFIG_SIGMATEL_FIR=m
+CONFIG_NSC_FIR=m
+CONFIG_WINBOND_FIR=m
+CONFIG_SMC_IRCC_FIR=m
+CONFIG_ALI_FIR=m
+CONFIG_VLSI_FIR=m
+CONFIG_VIA_FIR=m
+CONFIG_MCS_FIR=m
+CONFIG_BT=m
+CONFIG_BT_L2CAP=m
+CONFIG_BT_SCO=m
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+CONFIG_BT_BNEP=m
+CONFIG_BT_BNEP_MC_FILTER=y
+CONFIG_BT_BNEP_PROTO_FILTER=y
+CONFIG_BT_CMTP=m
+CONFIG_BT_HIDP=m
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BT_HCIUSB=m
+CONFIG_BT_HCIUSB_SCO=y
+CONFIG_BT_HCIUART=m
+CONFIG_BT_HCIUART_H4=y
+CONFIG_BT_HCIUART_BCSP=y
+CONFIG_BT_HCIBCM203X=m
+CONFIG_BT_HCIBPA10X=m
+CONFIG_BT_HCIBFUSB=m
+CONFIG_BT_HCIDTL1=m
+CONFIG_BT_HCIBT3C=m
+CONFIG_BT_HCIBLUECARD=m
+CONFIG_BT_HCIBTUART=m
+CONFIG_BT_HCIVHCI=m
+CONFIG_TUX=m
+
+#
+# TUX options
+#
+CONFIG_TUX_EXTCGI=y
+CONFIG_TUX_EXTENDED_LOG=y
+# CONFIG_TUX_DEBUG is not set
+CONFIG_NETLABEL=y
+CONFIG_FIB_RULES=y
+
+#
+# Wireless
+#
+CONFIG_CFG80211=m
+CONFIG_NL80211=y
+CONFIG_WIRELESS_EXT=y
+CONFIG_NET_WIRELESS_RTNETLINK=y
+CONFIG_MAC80211=m
+CONFIG_MAC80211_RCSIMPLE=y
+CONFIG_MAC80211_LEDS=y
+# CONFIG_MAC80211_DEBUGFS is not set
+# CONFIG_MAC80211_DEBUG is not set
+CONFIG_IEEE80211=m
+CONFIG_IEEE80211_DEBUG=y
+CONFIG_IEEE80211_CRYPT_WEP=m
+CONFIG_IEEE80211_CRYPT_CCMP=m
+CONFIG_IEEE80211_CRYPT_TKIP=m
+CONFIG_IEEE80211_SOFTMAC=m
+CONFIG_IEEE80211_SOFTMAC_DEBUG=y
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=y
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
+# CONFIG_SYS_HYPERVISOR is not set
+
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+CONFIG_CONNECTOR=y
+CONFIG_PROC_EVENTS=y
+
+#
+# Memory Technology Devices (MTD)
+#
+CONFIG_MTD=m
+# CONFIG_MTD_DEBUG is not set
+CONFIG_MTD_CONCAT=m
+CONFIG_MTD_PARTITIONS=y
+CONFIG_MTD_REDBOOT_PARTS=m
+CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1
+# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set
+# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set
+CONFIG_MTD_CMDLINE_PARTS=y
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=m
+CONFIG_MTD_BLOCK=m
+CONFIG_MTD_BLOCK_RO=m
+CONFIG_FTL=m
+CONFIG_NFTL=m
+CONFIG_NFTL_RW=y
+CONFIG_INFTL=m
+CONFIG_RFD_FTL=m
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=m
+CONFIG_MTD_JEDECPROBE=m
+CONFIG_MTD_GEN_PROBE=m
+# CONFIG_MTD_CFI_ADV_OPTIONS is not set
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+CONFIG_MTD_CFI_INTELEXT=m
+CONFIG_MTD_CFI_AMDSTD=m
+CONFIG_MTD_CFI_STAA=m
+CONFIG_MTD_CFI_UTIL=m
+CONFIG_MTD_RAM=m
+CONFIG_MTD_ROM=m
+CONFIG_MTD_ABSENT=m
+# CONFIG_MTD_OBSOLETE_CHIPS is not set
+
+#
+# Mapping drivers for chip access
+#
+CONFIG_MTD_COMPLEX_MAPPINGS=y
+# CONFIG_MTD_PHYSMAP is not set
+CONFIG_MTD_PCI=m
+# CONFIG_MTD_PLATRAM is not set
+
+#
+# Self-contained MTD device drivers
+#
+CONFIG_MTD_PMC551=m
+# CONFIG_MTD_PMC551_BUGFIX is not set
+# CONFIG_MTD_PMC551_DEBUG is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_PHRAM is not set
+CONFIG_MTD_MTDRAM=m
+CONFIG_MTDRAM_TOTAL_SIZE=4096
+CONFIG_MTDRAM_ERASE_SIZE=128
+CONFIG_MTD_BLOCK2MTD=m
+
+#
+# Disk-On-Chip Device Drivers
+#
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOC2001PLUS is not set
+
+#
+# NAND Flash Device Drivers
+#
+CONFIG_MTD_NAND=m
+# CONFIG_MTD_NAND_VERIFY_WRITE is not set
+CONFIG_MTD_NAND_ECC_SMC=y
+CONFIG_MTD_NAND_IDS=m
+# CONFIG_MTD_NAND_DISKONCHIP is not set
+CONFIG_MTD_NAND_NANDSIM=m
+
+#
+# OneNAND Flash Device Drivers
+#
+# CONFIG_MTD_ONENAND is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_SERIAL=m
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_PC_PCMCIA=m
+CONFIG_PARPORT_NOT_PC=y
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_AX88796 is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=m
+CONFIG_PARIDE=m
+CONFIG_PARIDE_PARPORT=m
+
+#
+# Parallel IDE high-level drivers
+#
+CONFIG_PARIDE_PD=m
+CONFIG_PARIDE_PCD=m
+CONFIG_PARIDE_PF=m
+CONFIG_PARIDE_PT=m
+CONFIG_PARIDE_PG=m
+
+#
+# Parallel IDE protocol modules
+#
+CONFIG_PARIDE_ATEN=m
+CONFIG_PARIDE_BPCK=m
+CONFIG_PARIDE_COMM=m
+CONFIG_PARIDE_DSTR=m
+CONFIG_PARIDE_FIT2=m
+CONFIG_PARIDE_FIT3=m
+CONFIG_PARIDE_EPAT=m
+CONFIG_PARIDE_EPATC8=y
+CONFIG_PARIDE_EPIA=m
+CONFIG_PARIDE_FRIQ=m
+CONFIG_PARIDE_FRPW=m
+CONFIG_PARIDE_KBIC=m
+CONFIG_PARIDE_KTTI=m
+CONFIG_PARIDE_ON20=m
+CONFIG_PARIDE_ON26=m
+# CONFIG_BLK_CPQ_DA is not set
+CONFIG_BLK_CPQ_CISS_DA=m
+CONFIG_CISS_SCSI_TAPE=y
+CONFIG_BLK_DEV_DAC960=m
+CONFIG_BLK_DEV_UMEM=m
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_SX8=m
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=16384
+CONFIG_BLK_DEV_RAM_BLOCKSIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CDROM_PKTCDVD=m
+CONFIG_CDROM_PKTCDVD_BUFFERS=8
+# CONFIG_CDROM_PKTCDVD_WCACHE is not set
+CONFIG_ATA_OVER_ETH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_IDE_SATA is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+CONFIG_BLK_DEV_IDECS=m
+CONFIG_BLK_DEV_IDECD=m
+# CONFIG_BLK_DEV_IDETAPE is not set
+CONFIG_BLK_DEV_IDEFLOPPY=y
+CONFIG_BLK_DEV_IDESCSI=m
+CONFIG_IDE_TASK_IOCTL=y
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+CONFIG_BLK_DEV_GENERIC=y
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_SL82C105=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_TRIFLEX=y
+# CONFIG_BLK_DEV_CY82C693 is not set
+CONFIG_BLK_DEV_CS5520=y
+CONFIG_BLK_DEV_CS5530=y
+CONFIG_BLK_DEV_HPT34X=y
+# CONFIG_HPT34X_AUTODMA is not set
+CONFIG_BLK_DEV_HPT366=y
+# CONFIG_BLK_DEV_SC1200 is not set
+CONFIG_BLK_DEV_PIIX=y
+CONFIG_BLK_DEV_IT821X=y
+# CONFIG_BLK_DEV_NS87415 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+# CONFIG_PDC202XX_BURST is not set
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+CONFIG_BLK_DEV_VIA82CXXX=y
+CONFIG_BLK_DEV_IDE_PMAC=y
+CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST=y
+CONFIG_BLK_DEV_IDEDMA_PMAC=y
+# CONFIG_IDE_ARM is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_RAID_ATTRS=m
+CONFIG_SCSI=m
+CONFIG_SCSI_NETLINK=y
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_IOSTATS=y
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+CONFIG_SCSI_SAS_ATTRS=m
+CONFIG_SCSI_SAS_LIBSAS=m
+CONFIG_SCSI_SAS_ATA=y
+# CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_ISCSI_TCP is not set
+CONFIG_BLK_DEV_3W_XXXX_RAID=m
+CONFIG_SCSI_3W_9XXX=m
+CONFIG_SCSI_ACARD=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_SCSI_AIC7XXX=m
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=4
+CONFIG_AIC7XXX_RESET_DELAY_MS=15000
+# CONFIG_AIC7XXX_DEBUG_ENABLE is not set
+CONFIG_AIC7XXX_DEBUG_MASK=0
+# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set
+CONFIG_SCSI_AIC7XXX_OLD=m
+CONFIG_SCSI_AIC79XX=m
+CONFIG_AIC79XX_CMDS_PER_DEVICE=4
+CONFIG_AIC79XX_RESET_DELAY_MS=15000
+# CONFIG_AIC79XX_ENABLE_RD_STRM is not set
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
+CONFIG_SCSI_AIC94XX=m
+# CONFIG_AIC94XX_DEBUG is not set
+CONFIG_SCSI_ARCMSR=m
+CONFIG_MEGARAID_NEWGEN=y
+CONFIG_MEGARAID_MM=m
+CONFIG_MEGARAID_MAILBOX=m
+CONFIG_MEGARAID_LEGACY=m
+CONFIG_MEGARAID_SAS=m
+CONFIG_SCSI_HPTIOP=m
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+CONFIG_SCSI_GDTH=m
+CONFIG_SCSI_IPS=m
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_INITIO=m
+CONFIG_SCSI_INIA100=m
+CONFIG_SCSI_PPA=m
+CONFIG_SCSI_IMM=m
+# CONFIG_SCSI_IZIP_EPP16 is not set
+# CONFIG_SCSI_IZIP_SLOW_CTR is not set
+CONFIG_SCSI_STEX=m
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+CONFIG_SCSI_SYM53C8XX_MMIO=y
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+CONFIG_SCSI_QLOGIC_1280=m
+CONFIG_SCSI_QLA_FC=m
+# CONFIG_SCSI_QLA_ISCSI is not set
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_DC395x=m
+# CONFIG_SCSI_DC390T is not set
+# CONFIG_SCSI_DEBUG is not set
+
+#
+# PCMCIA SCSI adapter support
+#
+# CONFIG_PCMCIA_FDOMAIN is not set
+CONFIG_PCMCIA_QLOGIC=m
+CONFIG_PCMCIA_SYM53C500=m
+CONFIG_ATA=m
+# CONFIG_ATA_NONSTANDARD is not set
+CONFIG_SATA_AHCI=m
+CONFIG_SATA_SVW=m
+CONFIG_ATA_PIIX=m
+CONFIG_SATA_MV=m
+CONFIG_SATA_NV=m
+CONFIG_PDC_ADMA=m
+CONFIG_SATA_QSTOR=m
+CONFIG_SATA_PROMISE=m
+CONFIG_SATA_SX4=m
+CONFIG_SATA_SIL=m
+CONFIG_SATA_SIL24=m
+CONFIG_SATA_SIS=m
+CONFIG_SATA_ULI=m
+CONFIG_SATA_VIA=m
+CONFIG_SATA_VITESSE=m
+CONFIG_SATA_INIC162X=m
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD640_PCI is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_TRIFLEX is not set
+CONFIG_PATA_MARVELL=m
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_NS87415 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PCMCIA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+CONFIG_PATA_PDC2027X=m
+# CONFIG_PATA_SIL680 is not set
+CONFIG_PATA_SIS=m
+# CONFIG_PATA_VIA is not set
+# CONFIG_PATA_WINBOND is not set
+CONFIG_ATA_INTEL_COMBINED=y
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID456=m
+CONFIG_MD_RAID5_RESHAPE=y
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
+CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_HP=m
+CONFIG_DM_UEVENT=y
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=y
+CONFIG_FUSION_SPI=m
+CONFIG_FUSION_FC=m
+CONFIG_FUSION_SAS=m
+CONFIG_FUSION_MAX_SGE=128
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_FUSION_LOGGING=y
+CONFIG_FIREWIRE=m
+CONFIG_FIREWIRE_OHCI=m
+CONFIG_FIREWIRE_SBP2=m
+
+#
+# IEEE 1394 (FireWire) support
+#
+# CONFIG_IEEE1394 is not set
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+CONFIG_ADB_PMU=y
+CONFIG_ADB_PMU_LED=y
+CONFIG_ADB_PMU_LED_IDE=y
+CONFIG_PMAC_SMU=y
+CONFIG_THERM_PM72=y
+CONFIG_WINDFARM=y
+CONFIG_WINDFARM_PM81=y
+CONFIG_WINDFARM_PM91=y
+CONFIG_WINDFARM_PM112=y
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+CONFIG_IFB=m
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+
+#
+# PHY device support
+#
+CONFIG_PHYLIB=m
+
+#
+# MII PHY device drivers
+#
+CONFIG_MARVELL_PHY=m
+CONFIG_DAVICOM_PHY=m
+CONFIG_QSEMI_PHY=m
+CONFIG_LXT_PHY=m
+CONFIG_CICADA_PHY=m
+CONFIG_VITESSE_PHY=m
+CONFIG_SMSC_PHY=m
+CONFIG_FIXED_PHY=m
+CONFIG_FIXED_MII_10_FDX=y
+CONFIG_FIXED_MII_100_FDX=y
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=m
+CONFIG_HAPPYMEAL=m
+CONFIG_SUNGEM=m
+CONFIG_CASSINI=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+CONFIG_NET_TULIP=y
+CONFIG_DE2104X=m
+CONFIG_TULIP=m
+# CONFIG_TULIP_MWI is not set
+CONFIG_TULIP_MMIO=y
+# CONFIG_TULIP_NAPI is not set
+CONFIG_DE4X5=m
+CONFIG_WINBOND_840=m
+CONFIG_DM9102=m
+CONFIG_ULI526X=m
+CONFIG_PCMCIA_XIRCOM=m
+# CONFIG_HP100 is not set
+CONFIG_IBMVETH=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_AMD8111E_NAPI=y
+CONFIG_ADAPTEC_STARFIRE=m
+CONFIG_ADAPTEC_STARFIRE_NAPI=y
+CONFIG_B44=m
+CONFIG_FORCEDETH=m
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+CONFIG_FEALNX=m
+CONFIG_NATSEMI=m
+CONFIG_NE2K_PCI=m
+CONFIG_8139CP=m
+CONFIG_8139TOO=m
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+CONFIG_8139TOO_8129=y
+# CONFIG_8139_OLD_RX_RESET is not set
+CONFIG_SIS900=m
+CONFIG_EPIC100=m
+CONFIG_SUNDANCE=m
+# CONFIG_SUNDANCE_MMIO is not set
+CONFIG_VIA_RHINE=m
+CONFIG_VIA_RHINE_MMIO=y
+CONFIG_VIA_RHINE_NAPI=y
+CONFIG_NET_POCKET=y
+CONFIG_DE600=m
+CONFIG_DE620=m
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+CONFIG_DL2K=m
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_E1000E=m
+CONFIG_IGB=m
+CONFIG_NS83820=m
+CONFIG_HAMACHI=m
+CONFIG_YELLOWFIN=m
+CONFIG_R8169=m
+CONFIG_R8169_NAPI=y
+CONFIG_R8169_VLAN=y
+CONFIG_SIS190=m
+CONFIG_SKGE=m
+CONFIG_SKY2=m
+# CONFIG_SK98LIN is not set
+CONFIG_VIA_VELOCITY=m
+CONFIG_TIGON3=m
+CONFIG_BNX2=m
+CONFIG_SPIDER_NET=m
+# CONFIG_MV643XX_ETH is not set
+CONFIG_QLA3XXX=m
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_CHELSIO_T1=m
+CONFIG_CHELSIO_T3=m
+CONFIG_EHEA=m
+CONFIG_IXGBE=m
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_MYRI10GE=m
+CONFIG_NETXEN_NIC=m
+CONFIG_BNX2X=m
+# CONFIG_MLX4_CORE is not set
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+CONFIG_3C359=m
+# CONFIG_TMS380TR is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+CONFIG_NET_RADIO=y
+
+#
+# Obsolete Wireless cards support (pre-802.11)
+#
+# CONFIG_STRIP is not set
+CONFIG_PCMCIA_WAVELAN=m
+CONFIG_PCMCIA_NETWAVE=m
+
+#
+# Wireless 802.11 Frequency Hopping cards support
+#
+# CONFIG_PCMCIA_RAYCS is not set
+
+#
+# Wireless 802.11b ISA/PCI cards support
+#
+# CONFIG_IPW2100 is not set
+# CONFIG_IPW2200 is not set
+CONFIG_AIRO=m
+CONFIG_HERMES=m
+CONFIG_APPLE_AIRPORT=m
+CONFIG_PLX_HERMES=m
+CONFIG_TMD_HERMES=m
+CONFIG_NORTEL_HERMES=m
+CONFIG_PCI_HERMES=m
+CONFIG_ATMEL=m
+CONFIG_PCI_ATMEL=m
+
+#
+# Wireless 802.11b Pcmcia/Cardbus cards support
+#
+CONFIG_PCMCIA_HERMES=m
+CONFIG_PCMCIA_SPECTRUM=m
+CONFIG_AIRO_CS=m
+CONFIG_PCMCIA_ATMEL=m
+CONFIG_PCMCIA_WL3501=m
+
+#
+# Prism GT/Duette 802.11(a/b/g) PCI/Cardbus support
+#
+CONFIG_PRISM54=m
+CONFIG_USB_ZD1201=m
+CONFIG_HOSTAP=m
+CONFIG_HOSTAP_FIRMWARE=y
+CONFIG_HOSTAP_FIRMWARE_NVRAM=y
+CONFIG_HOSTAP_PLX=m
+CONFIG_HOSTAP_PCI=m
+CONFIG_HOSTAP_CS=m
+CONFIG_BCM43XX=m
+CONFIG_BCM43XX_DEBUG=y
+CONFIG_BCM43XX_DMA=y
+CONFIG_BCM43XX_PIO=y
+CONFIG_BCM43XX_DMA_AND_PIO_MODE=y
+# CONFIG_BCM43XX_DMA_MODE is not set
+# CONFIG_BCM43XX_PIO_MODE is not set
+CONFIG_ZD1211RW=m
+# CONFIG_ZD1211RW_DEBUG is not set
+CONFIG_NET_WIRELESS=y
+CONFIG_IWL4965=m
+CONFIG_IWL4965_QOS=y
+CONFIG_IWL4965_SPECTRUM_MEASUREMENT=y
+CONFIG_IWL4965_SENSITIVITY=y
+# CONFIG_IWL4965_DEBUG is not set
+# CONFIG_IWL3945 is not set
+
+#
+# PCMCIA network device support
+#
+CONFIG_NET_PCMCIA=y
+CONFIG_PCMCIA_3C589=m
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_FMVJ18X=m
+CONFIG_PCMCIA_PCNET=m
+CONFIG_PCMCIA_NMCLAN=m
+CONFIG_PCMCIA_SMC91C92=m
+CONFIG_PCMCIA_XIRC2PS=m
+CONFIG_PCMCIA_AXNET=m
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# ATM drivers
+#
+# CONFIG_ATM_DUMMY is not set
+CONFIG_ATM_TCP=m
+CONFIG_ATM_LANAI=m
+CONFIG_ATM_ENI=m
+# CONFIG_ATM_ENI_DEBUG is not set
+# CONFIG_ATM_ENI_TUNE_BURST is not set
+# CONFIG_ATM_FIRESTREAM is not set
+# CONFIG_ATM_ZATM is not set
+CONFIG_ATM_IDT77252=m
+# CONFIG_ATM_IDT77252_DEBUG is not set
+# CONFIG_ATM_IDT77252_RCV_ALL is not set
+CONFIG_ATM_IDT77252_USE_SUNI=y
+# CONFIG_ATM_AMBASSADOR is not set
+# CONFIG_ATM_HORIZON is not set
+CONFIG_ATM_FORE200E_MAYBE=m
+# CONFIG_ATM_FORE200E_PCA is not set
+CONFIG_ATM_HE=m
+# CONFIG_ATM_HE_USE_SUNI is not set
+CONFIG_FDDI=y
+# CONFIG_DEFXX is not set
+CONFIG_SKFP=m
+# CONFIG_HIPPI is not set
+CONFIG_PLIP=m
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_PPPOATM=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+CONFIG_NET_FC=y
+# CONFIG_SHAPER is not set
+CONFIG_NETCONSOLE=m
+CONFIG_NETPOLL=y
+# CONFIG_NETPOLL_RX is not set
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+CONFIG_ISDN=m
+
+#
+# Old ISDN4Linux
+#
+CONFIG_ISDN_I4L=m
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_IPPP_FILTER=y
+# CONFIG_ISDN_PPP_BSDCOMP is not set
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+
+#
+# ISDN feature submodules
+#
+CONFIG_ISDN_DIVERSION=m
+
+#
+# ISDN4Linux hardware drivers
+#
+
+#
+# Passive cards
+#
+CONFIG_ISDN_DRV_HISAX=m
+
+#
+# D-channel protocol features
+#
+CONFIG_HISAX_EURO=y
+CONFIG_DE_AOC=y
+CONFIG_HISAX_NO_SENDCOMPLETE=y
+CONFIG_HISAX_NO_LLC=y
+CONFIG_HISAX_NO_KEYPAD=y
+CONFIG_HISAX_1TR6=y
+CONFIG_HISAX_NI1=y
+CONFIG_HISAX_MAX_CARDS=8
+
+#
+# HiSax supported cards
+#
+CONFIG_HISAX_16_3=y
+CONFIG_HISAX_S0BOX=y
+CONFIG_HISAX_AVM_A1_PCMCIA=y
+CONFIG_HISAX_ELSA=y
+CONFIG_HISAX_DIEHLDIVA=y
+CONFIG_HISAX_SEDLBAUER=y
+CONFIG_HISAX_NICCY=y
+CONFIG_HISAX_BKM_A4T=y
+CONFIG_HISAX_SCT_QUADRO=y
+CONFIG_HISAX_GAZEL=y
+CONFIG_HISAX_W6692=y
+CONFIG_HISAX_HFC_SX=y
+# CONFIG_HISAX_DEBUG is not set
+
+#
+# HiSax PCMCIA card service modules
+#
+CONFIG_HISAX_SEDLBAUER_CS=m
+CONFIG_HISAX_ELSA_CS=m
+CONFIG_HISAX_AVM_A1_CS=m
+CONFIG_HISAX_TELES_CS=m
+
+#
+# HiSax sub driver modules
+#
+CONFIG_HISAX_ST5481=m
+# CONFIG_HISAX_HFCUSB is not set
+CONFIG_HISAX_HFC4S8S=m
+CONFIG_HISAX_FRITZ_PCIPNP=m
+CONFIG_HISAX_HDLC=y
+
+#
+# Active cards
+#
+
+#
+# Siemens Gigaset
+#
+CONFIG_ISDN_DRV_GIGASET=m
+CONFIG_GIGASET_BASE=m
+CONFIG_GIGASET_M105=m
+# CONFIG_GIGASET_DEBUG is not set
+# CONFIG_GIGASET_UNDOCREQ is not set
+
+#
+# CAPI subsystem
+#
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIFS_BOOL=y
+CONFIG_ISDN_CAPI_CAPIFS=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+
+#
+# CAPI hardware drivers
+#
+
+#
+# Active AVM cards
+#
+CONFIG_CAPI_AVM=y
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
+
+#
+# Active Eicon DIVA Server cards
+#
+CONFIG_CAPI_EICON=y
+CONFIG_ISDN_DIVAS=m
+CONFIG_ISDN_DIVAS_BRIPCI=y
+CONFIG_ISDN_DIVAS_PRIPCI=y
+CONFIG_ISDN_DIVAS_DIVACAPI=m
+CONFIG_ISDN_DIVAS_USERIDI=m
+CONFIG_ISDN_DIVAS_MAINT=m
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+# CONFIG_INPUT_TSDEV is not set
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+CONFIG_MOUSE_SERIAL=m
+CONFIG_MOUSE_VSXXXAA=m
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_ANALOG=m
+CONFIG_JOYSTICK_A3D=m
+CONFIG_JOYSTICK_ADI=m
+CONFIG_JOYSTICK_COBRA=m
+CONFIG_JOYSTICK_GF2K=m
+CONFIG_JOYSTICK_GRIP=m
+CONFIG_JOYSTICK_GRIP_MP=m
+CONFIG_JOYSTICK_GUILLEMOT=m
+CONFIG_JOYSTICK_INTERACT=m
+CONFIG_JOYSTICK_SIDEWINDER=m
+CONFIG_JOYSTICK_TMDC=m
+CONFIG_JOYSTICK_IFORCE=m
+CONFIG_JOYSTICK_IFORCE_USB=y
+CONFIG_JOYSTICK_IFORCE_232=y
+CONFIG_JOYSTICK_WARRIOR=m
+CONFIG_JOYSTICK_MAGELLAN=m
+CONFIG_JOYSTICK_SPACEORB=m
+CONFIG_JOYSTICK_SPACEBALL=m
+CONFIG_JOYSTICK_STINGER=m
+CONFIG_JOYSTICK_TWIDJOY=m
+CONFIG_JOYSTICK_DB9=m
+CONFIG_JOYSTICK_GAMECON=m
+CONFIG_JOYSTICK_TURBOGRAFX=m
+CONFIG_JOYSTICK_JOYDUMP=m
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_TOUCHSCREEN_GUNZE=m
+CONFIG_TOUCHSCREEN_ELO=m
+CONFIG_TOUCHSCREEN_MTOUCH=m
+# CONFIG_TOUCHSCREEN_MK712 is not set
+CONFIG_INPUT_MISC=y
+# CONFIG_INPUT_PCSPKR is not set
+CONFIG_INPUT_UINPUT=m
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+CONFIG_SERIO_SERPORT=y
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+CONFIG_SERIO_RAW=m
+CONFIG_GAMEPORT=m
+# CONFIG_GAMEPORT_NS558 is not set
+# CONFIG_GAMEPORT_L4 is not set
+CONFIG_GAMEPORT_EMU10K1=m
+CONFIG_GAMEPORT_FM801=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_COMPUTONE is not set
+# CONFIG_ROCKETPORT is not set
+CONFIG_CYCLADES=m
+# CONFIG_CYZ_INTR is not set
+# CONFIG_DIGIEPCA is not set
+# CONFIG_MOXA_INTELLIO is not set
+# CONFIG_MOXA_SMARTIO is not set
+# CONFIG_ISI is not set
+CONFIG_SYNCLINK=m
+CONFIG_SYNCLINKMP=m
+CONFIG_SYNCLINK_GT=m
+CONFIG_N_HDLC=m
+# CONFIG_SPECIALIX is not set
+# CONFIG_SX is not set
+# CONFIG_RIO is not set
+# CONFIG_STALDRV is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_PCI=y
+CONFIG_SERIAL_8250_CS=m
+CONFIG_SERIAL_8250_NR_UARTS=32
+CONFIG_SERIAL_8250_RUNTIME_UARTS=4
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_SERIAL_8250_DETECT_IRQ=y
+CONFIG_SERIAL_8250_RSA=y
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_SERIAL_PMACZILOG=m
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_SERIAL_OF_PLATFORM=y
+CONFIG_UNIX98_PTYS=y
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_CRASH is not set
+CONFIG_PRINTER=m
+CONFIG_LP_CONSOLE=y
+CONFIG_PPDEV=m
+CONFIG_TIPAR=m
+CONFIG_HVC_DRIVER=y
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVC_RTAS=y
+CONFIG_HVCS=m
+
+#
+# IPMI
+#
+CONFIG_IPMI_HANDLER=m
+CONFIG_IPMI_PANIC_EVENT=y
+CONFIG_IPMI_PANIC_STRING=y
+CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
+CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_WATCHDOG_RTAS=m
+
+#
+# PCI-based Watchdog Cards
+#
+CONFIG_PCIPCWATCHDOG=m
+CONFIG_WDTPCI=m
+CONFIG_WDT_501_PCI=y
+
+#
+# USB-based Watchdog Cards
+#
+CONFIG_USBPCWATCHDOG=m
+CONFIG_HW_RANDOM=y
+CONFIG_GEN_RTC=y
+# CONFIG_GEN_RTC_X is not set
+# CONFIG_DTLK is not set
+CONFIG_R3964=m
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_AGP=y
+CONFIG_AGP_SIS=y
+CONFIG_AGP_VIA=y
+CONFIG_AGP_UNINORTH=y
+CONFIG_DRM=m
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_RADEON=m
+CONFIG_DRM_MGA=m
+CONFIG_DRM_SIS=m
+CONFIG_DRM_VIA=m
+CONFIG_DRM_SAVAGE=m
+
+#
+# PCMCIA character devices
+#
+# CONFIG_SYNCLINK_CS is not set
+CONFIG_CARDMAN_4000=m
+CONFIG_CARDMAN_4040=m
+# CONFIG_RAW_DRIVER is not set
+CONFIG_HANGCHECK_TIMER=m
+CONFIG_TCG_TPM=m
+CONFIG_TCG_ATMEL=m
+CONFIG_TELCLOCK=m
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=m
+
+#
+# I2C Algorithms
+#
+CONFIG_I2C_ALGOBIT=y
+CONFIG_I2C_ALGOPCF=m
+CONFIG_I2C_ALGOPCA=m
+
+#
+# I2C Hardware Bus support
+#
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI1563 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+# CONFIG_I2C_AMD8111 is not set
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_I810 is not set
+CONFIG_I2C_PIIX4=m
+CONFIG_I2C_ISA=m
+CONFIG_I2C_POWERMAC=y
+CONFIG_I2C_NFORCE2=m
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_PARPORT=m
+CONFIG_I2C_PARPORT_LIGHT=m
+CONFIG_I2C_PROSAVAGE=m
+CONFIG_I2C_SAVAGE4=m
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+CONFIG_I2C_STUB=m
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+CONFIG_I2C_VOODOO3=m
+CONFIG_I2C_PCA_ISA=m
+
+#
+# Miscellaneous I2C Chip support
+#
+CONFIG_SENSORS_DS1337=m
+CONFIG_SENSORS_DS1374=m
+CONFIG_SENSORS_EEPROM=m
+CONFIG_SENSORS_PCF8574=m
+CONFIG_SENSORS_PCA9539=m
+CONFIG_SENSORS_PCF8591=m
+CONFIG_SENSORS_MAX6875=m
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+
+#
+# SPI support
+#
+# CONFIG_SPI is not set
+# CONFIG_SPI_MASTER is not set
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Hardware Monitoring support
+#
+CONFIG_HWMON=m
+CONFIG_HWMON_VID=m
+# CONFIG_SENSORS_ABITUGURU is not set
+CONFIG_SENSORS_ADM1021=m
+CONFIG_SENSORS_ADM1025=m
+CONFIG_SENSORS_ADM1026=m
+CONFIG_SENSORS_ADM1031=m
+CONFIG_SENSORS_ADM9240=m
+CONFIG_SENSORS_ASB100=m
+CONFIG_SENSORS_ATXP1=m
+CONFIG_SENSORS_DS1621=m
+CONFIG_SENSORS_F71805F=m
+CONFIG_SENSORS_FSCHER=m
+CONFIG_SENSORS_FSCPOS=m
+CONFIG_SENSORS_GL518SM=m
+CONFIG_SENSORS_GL520SM=m
+# CONFIG_SENSORS_IT87 is not set
+CONFIG_SENSORS_LM63=m
+CONFIG_SENSORS_LM75=m
+CONFIG_SENSORS_LM77=m
+CONFIG_SENSORS_LM78=m
+CONFIG_SENSORS_LM80=m
+CONFIG_SENSORS_LM83=m
+CONFIG_SENSORS_LM85=m
+CONFIG_SENSORS_LM87=m
+CONFIG_SENSORS_LM90=m
+CONFIG_SENSORS_LM92=m
+CONFIG_SENSORS_MAX1619=m
+# CONFIG_SENSORS_PC87360 is not set
+CONFIG_SENSORS_SIS5595=m
+# CONFIG_SENSORS_SMSC47M1 is not set
+CONFIG_SENSORS_SMSC47M192=m
+# CONFIG_SENSORS_SMSC47B397 is not set
+CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_VT8231=m
+# CONFIG_SENSORS_W83781D is not set
+CONFIG_SENSORS_W83791D=m
+CONFIG_SENSORS_W83792D=m
+CONFIG_SENSORS_W83L785TS=m
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
+# CONFIG_HWMON_DEBUG_CHIP is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+CONFIG_VIDEO_V4L1=y
+CONFIG_VIDEO_V4L1_COMPAT=y
+CONFIG_VIDEO_V4L2=y
+
+#
+# Video Capture Adapters
+#
+
+#
+# Video Capture Adapters
+#
+# CONFIG_VIDEO_ADV_DEBUG is not set
+# CONFIG_VIDEO_VIVI is not set
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_BT848_DVB=y
+CONFIG_VIDEO_SAA6588=m
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+CONFIG_VIDEO_CPIA_USB=m
+CONFIG_VIDEO_CPIA2=m
+CONFIG_VIDEO_SAA5246A=m
+CONFIG_VIDEO_SAA5249=m
+CONFIG_TUNER_3036=m
+CONFIG_VIDEO_SAA7134=m
+CONFIG_VIDEO_SAA7134_ALSA=m
+CONFIG_VIDEO_SAA7134_DVB=m
+CONFIG_VIDEO_SAA7134_DVB_ALL_FRONTENDS=y
+CONFIG_VIDEO_MXB=m
+CONFIG_VIDEO_DPC=m
+CONFIG_VIDEO_HEXIUM_ORION=m
+CONFIG_VIDEO_HEXIUM_GEMINI=m
+CONFIG_VIDEO_CX88_VP3054=m
+CONFIG_VIDEO_CX88=m
+CONFIG_VIDEO_CX88_ALSA=m
+CONFIG_VIDEO_CX88_BLACKBIRD=m
+CONFIG_VIDEO_CX88_DVB=m
+CONFIG_VIDEO_CX88_DVB_ALL_FRONTENDS=y
+
+#
+# Encoders and Decoders
+#
+CONFIG_VIDEO_MSP3400=m
+CONFIG_VIDEO_CS53L32A=m
+CONFIG_VIDEO_TLV320AIC23B=m
+CONFIG_VIDEO_WM8775=m
+CONFIG_VIDEO_WM8739=m
+CONFIG_VIDEO_CX2341X=m
+CONFIG_VIDEO_CX25840=m
+CONFIG_VIDEO_SAA711X=m
+CONFIG_VIDEO_SAA7127=m
+CONFIG_VIDEO_UPD64031A=m
+CONFIG_VIDEO_UPD64083=m
+
+#
+# V4L USB devices
+#
+CONFIG_VIDEO_PVRUSB2=m
+CONFIG_VIDEO_PVRUSB2_24XXX=y
+CONFIG_VIDEO_PVRUSB2_SYSFS=y
+# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set
+CONFIG_VIDEO_EM28XX=m
+CONFIG_VIDEO_USBVIDEO=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_KONICAWC=m
+CONFIG_USB_QUICKCAM_MESSENGER=m
+CONFIG_USB_ET61X251=m
+CONFIG_VIDEO_OVCAMCHIP=m
+CONFIG_USB_W9968CF=m
+CONFIG_USB_OV511=m
+CONFIG_USB_SE401=m
+CONFIG_USB_SN9C102=m
+CONFIG_USB_STV680=m
+CONFIG_USB_ZC0301=m
+CONFIG_USB_PWC=m
+# CONFIG_USB_PWC_DEBUG is not set
+
+#
+# Radio Adapters
+#
+CONFIG_RADIO_GEMTEK_PCI=m
+CONFIG_RADIO_MAXIRADIO=m
+CONFIG_RADIO_MAESTRO=m
+CONFIG_USB_DSBR=m
+
+#
+# Digital Video Broadcasting Devices
+#
+CONFIG_DVB=y
+CONFIG_DVB_CORE=m
+
+#
+# Supported SAA7146 based PCI Adapters
+#
+CONFIG_DVB_AV7110=m
+CONFIG_DVB_AV7110_OSD=y
+CONFIG_DVB_BUDGET=m
+CONFIG_DVB_BUDGET_CI=m
+CONFIG_DVB_BUDGET_AV=m
+CONFIG_DVB_BUDGET_PATCH=m
+
+#
+# Supported USB Adapters
+#
+CONFIG_DVB_USB=m
+# CONFIG_DVB_USB_DEBUG is not set
+CONFIG_DVB_USB_A800=m
+CONFIG_DVB_USB_DIBUSB_MB=m
+# CONFIG_DVB_USB_DIBUSB_MB_FAULTY is not set
+CONFIG_DVB_USB_DIBUSB_MC=m
+CONFIG_DVB_USB_UMT_010=m
+CONFIG_DVB_USB_CXUSB=m
+CONFIG_DVB_USB_DIGITV=m
+CONFIG_DVB_USB_VP7045=m
+CONFIG_DVB_USB_VP702X=m
+CONFIG_DVB_USB_GP8PSK=m
+CONFIG_DVB_USB_NOVA_T_USB2=m
+CONFIG_DVB_USB_DTT200U=m
+CONFIG_DVB_TTUSB_BUDGET=m
+CONFIG_DVB_TTUSB_DEC=m
+CONFIG_DVB_CINERGYT2=m
+CONFIG_DVB_CINERGYT2_TUNING=y
+CONFIG_DVB_CINERGYT2_STREAM_URB_COUNT=32
+CONFIG_DVB_CINERGYT2_STREAM_BUF_SIZE=512
+CONFIG_DVB_CINERGYT2_QUERY_INTERVAL=250
+CONFIG_DVB_CINERGYT2_ENABLE_RC_INPUT_DEVICE=y
+CONFIG_DVB_CINERGYT2_RC_QUERY_INTERVAL=100
+
+#
+# Supported FlexCopII (B2C2) Adapters
+#
+CONFIG_DVB_B2C2_FLEXCOP=m
+CONFIG_DVB_B2C2_FLEXCOP_PCI=m
+CONFIG_DVB_B2C2_FLEXCOP_USB=m
+# CONFIG_DVB_B2C2_FLEXCOP_DEBUG is not set
+
+#
+# Supported BT878 Adapters
+#
+CONFIG_DVB_BT8XX=m
+
+#
+# Supported Pluto2 Adapters
+#
+CONFIG_DVB_PLUTO2=m
+
+#
+# Supported DVB Frontends
+#
+
+#
+# Customise DVB Frontends
+#
+
+#
+# DVB-S (satellite) frontends
+#
+CONFIG_DVB_STV0299=m
+CONFIG_DVB_CX24110=m
+CONFIG_DVB_CX24123=m
+CONFIG_DVB_TDA8083=m
+CONFIG_DVB_MT312=m
+CONFIG_DVB_VES1X93=m
+CONFIG_DVB_S5H1420=m
+
+#
+# DVB-T (terrestrial) frontends
+#
+CONFIG_DVB_SP8870=m
+CONFIG_DVB_SP887X=m
+CONFIG_DVB_CX22700=m
+CONFIG_DVB_CX22702=m
+CONFIG_DVB_L64781=m
+CONFIG_DVB_TDA1004X=m
+CONFIG_DVB_NXT6000=m
+CONFIG_DVB_MT352=m
+CONFIG_DVB_ZL10353=m
+CONFIG_DVB_DIB3000MB=m
+CONFIG_DVB_DIB3000MC=m
+
+#
+# DVB-C (cable) frontends
+#
+CONFIG_DVB_VES1820=m
+CONFIG_DVB_TDA10021=m
+CONFIG_DVB_STV0297=m
+
+#
+# ATSC (North American/Korean Terrestrial/Cable DTV) frontends
+#
+CONFIG_DVB_NXT200X=m
+CONFIG_DVB_OR51211=m
+CONFIG_DVB_OR51132=m
+CONFIG_DVB_BCM3510=m
+CONFIG_DVB_LGDT330X=m
+
+#
+# Miscellaneous devices
+#
+CONFIG_DVB_PLL=m
+CONFIG_DVB_LNBP21=m
+CONFIG_DVB_ISL6421=m
+CONFIG_VIDEO_SAA7146=m
+CONFIG_VIDEO_SAA7146_VV=m
+CONFIG_VIDEO_VIDEOBUF=m
+CONFIG_VIDEO_TUNER=m
+CONFIG_VIDEO_BUF=m
+CONFIG_VIDEO_BUF_DVB=m
+CONFIG_VIDEO_BTCX=m
+CONFIG_VIDEO_IR=m
+CONFIG_VIDEO_TVEEPROM=m
+CONFIG_USB_DABUSB=m
+
+#
+# Graphics support
+#
+# CONFIG_FIRMWARE_EDID is not set
+CONFIG_FB=y
+CONFIG_FB_CFB_FILLRECT=y
+CONFIG_FB_CFB_COPYAREA=y
+CONFIG_FB_CFB_IMAGEBLIT=y
+CONFIG_FB_MACMODES=y
+# CONFIG_FB_BACKLIGHT is not set
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_CIRRUS=m
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_ASILIANT is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_S1D13XXX is not set
+CONFIG_FB_NVIDIA=m
+CONFIG_FB_NVIDIA_I2C=y
+CONFIG_FB_RIVA=m
+# CONFIG_FB_RIVA_I2C is not set
+# CONFIG_FB_RIVA_DEBUG is not set
+CONFIG_FB_MATROX=m
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+CONFIG_FB_MATROX_MULTIHEAD=y
+CONFIG_FB_RADEON=y
+CONFIG_FB_RADEON_I2C=y
+# CONFIG_FB_RADEON_DEBUG is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+CONFIG_FB_SAVAGE=m
+CONFIG_FB_SAVAGE_I2C=y
+CONFIG_FB_SAVAGE_ACCEL=y
+# CONFIG_FB_SIS is not set
+CONFIG_FB_NEOMAGIC=m
+CONFIG_FB_KYRO=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_3DFX_ACCEL=y
+CONFIG_FB_VOODOO1=m
+CONFIG_FB_TRIDENT=m
+CONFIG_FB_TRIDENT_ACCEL=y
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_VGACON_SOFT_SCROLLBACK=y
+CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_LOGO_LINUX_CLUT224=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BACKLIGHT_CLASS_DEVICE=m
+CONFIG_BACKLIGHT_DEVICE=y
+CONFIG_LCD_CLASS_DEVICE=m
+CONFIG_LCD_DEVICE=y
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+
+#
+# Advanced Linux Sound Architecture
+#
+CONFIG_SND=m
+CONFIG_SND_TIMER=m
+CONFIG_SND_PCM=m
+CONFIG_SND_HWDEP=m
+CONFIG_SND_RAWMIDI=m
+CONFIG_SND_SEQUENCER=m
+CONFIG_SND_SEQ_DUMMY=m
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_MIXER_OSS=m
+CONFIG_SND_PCM_OSS=m
+CONFIG_SND_PCM_OSS_PLUGINS=y
+CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_DYNAMIC_MINORS=y
+# CONFIG_SND_SUPPORT_OLD_API is not set
+CONFIG_SND_VERBOSE_PROCFS=y
+# CONFIG_SND_VERBOSE_PRINTK is not set
+# CONFIG_SND_DEBUG is not set
+
+#
+# Generic devices
+#
+CONFIG_SND_MPU401_UART=m
+CONFIG_SND_OPL3_LIB=m
+CONFIG_SND_VX_LIB=m
+CONFIG_SND_AC97_CODEC=m
+CONFIG_SND_DUMMY=m
+CONFIG_SND_VIRMIDI=m
+CONFIG_SND_MTPAV=m
+# CONFIG_SND_MTS64 is not set
+# CONFIG_SND_SERIAL_U16550 is not set
+CONFIG_SND_MPU401=m
+# CONFIG_SND_PORTMAN2X4 is not set
+
+#
+# PCI devices
+#
+CONFIG_SND_AD1889=m
+CONFIG_SND_ALS300=m
+CONFIG_SND_ALS4000=m
+CONFIG_SND_ALI5451=m
+CONFIG_SND_ATIIXP=m
+CONFIG_SND_ATIIXP_MODEM=m
+CONFIG_SND_AU8810=m
+CONFIG_SND_AU8820=m
+CONFIG_SND_AU8830=m
+CONFIG_SND_AZT3328=m
+CONFIG_SND_BT87X=m
+# CONFIG_SND_BT87X_OVERCLOCK is not set
+CONFIG_SND_CA0106=m
+CONFIG_SND_CMIPCI=m
+CONFIG_SND_CS4281=m
+CONFIG_SND_CS46XX=m
+CONFIG_SND_CS46XX_NEW_DSP=y
+CONFIG_SND_DARLA20=m
+CONFIG_SND_GINA20=m
+CONFIG_SND_LAYLA20=m
+CONFIG_SND_DARLA24=m
+CONFIG_SND_GINA24=m
+CONFIG_SND_LAYLA24=m
+CONFIG_SND_MONA=m
+CONFIG_SND_MIA=m
+CONFIG_SND_ECHO3G=m
+CONFIG_SND_INDIGO=m
+CONFIG_SND_INDIGOIO=m
+CONFIG_SND_INDIGODJ=m
+CONFIG_SND_EMU10K1=m
+CONFIG_SND_EMU10K1X=m
+CONFIG_SND_ENS1370=m
+CONFIG_SND_ENS1371=m
+CONFIG_SND_ES1938=m
+CONFIG_SND_ES1968=m
+CONFIG_SND_FM801=m
+CONFIG_SND_FM801_TEA575X_BOOL=y
+CONFIG_SND_FM801_TEA575X=m
+CONFIG_SND_HDA_INTEL=m
+CONFIG_SND_HDSP=m
+CONFIG_SND_HDSPM=m
+CONFIG_SND_ICE1712=m
+CONFIG_SND_ICE1724=m
+CONFIG_SND_INTEL8X0=m
+CONFIG_SND_INTEL8X0M=m
+CONFIG_SND_KORG1212=m
+CONFIG_SND_MAESTRO3=m
+CONFIG_SND_MIXART=m
+CONFIG_SND_NM256=m
+CONFIG_SND_PCXHR=m
+CONFIG_SND_RIPTIDE=m
+CONFIG_SND_RME32=m
+CONFIG_SND_RME96=m
+CONFIG_SND_RME9652=m
+CONFIG_SND_SONICVIBES=m
+CONFIG_SND_TRIDENT=m
+CONFIG_SND_VIA82XX=m
+CONFIG_SND_VIA82XX_MODEM=m
+CONFIG_SND_VX222=m
+CONFIG_SND_YMFPCI=m
+# CONFIG_SND_AC97_POWER_SAVE is not set
+
+#
+# ALSA PowerMac devices
+#
+CONFIG_SND_POWERMAC=m
+CONFIG_SND_POWERMAC_AUTO_DRC=y
+
+#
+# Apple Onboard Audio driver
+#
+CONFIG_SND_AOA=m
+CONFIG_SND_AOA_FABRIC_LAYOUT=m
+CONFIG_SND_AOA_ONYX=m
+CONFIG_SND_AOA_TAS=m
+CONFIG_SND_AOA_TOONIE=m
+CONFIG_SND_AOA_SOUNDBUS=m
+CONFIG_SND_AOA_SOUNDBUS_I2S=m
+
+#
+# USB devices
+#
+CONFIG_SND_USB_AUDIO=m
+CONFIG_SND_USB_USX2Y=m
+
+#
+# PCMCIA devices
+#
+# CONFIG_SND_VXPOCKET is not set
+# CONFIG_SND_PDAUDIOCF is not set
+
+#
+# SoC audio support
+#
+# CONFIG_SND_SOC is not set
+
+#
+# Open Sound System
+#
+# CONFIG_SOUND_PRIME is not set
+CONFIG_AC97_BUS=m
+
+#
+# USB support
+#
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
+CONFIG_USB_ARCH_HAS_EHCI=y
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_OTG is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+CONFIG_USB_EHCI_TT_NEWSCHED=y
+CONFIG_USB_ISP116X_HCD=m
+CONFIG_USB_OHCI_HCD=m
+# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_UHCI_HCD=m
+CONFIG_USB_SL811_HCD=m
+CONFIG_USB_SL811_CS=m
+
+#
+# USB Device Class drivers
+#
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# may also be needed; see USB_STORAGE Help for more information
+#
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_USBAT=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_STORAGE_ALAUDA=y
+# CONFIG_USB_LIBUSUAL is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+CONFIG_USB_HIDINPUT=y
+CONFIG_USB_HIDINPUT_POWERBOOK=y
+CONFIG_HID_FF=y
+CONFIG_HID_PID=y
+CONFIG_LOGITECH_FF=y
+CONFIG_THRUSTMASTER_FF=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_ACECAD=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_TOUCHSCREEN=m
+CONFIG_USB_TOUCHSCREEN_EGALAX=y
+CONFIG_USB_TOUCHSCREEN_PANJIT=y
+CONFIG_USB_TOUCHSCREEN_3M=y
+CONFIG_USB_TOUCHSCREEN_ITM=y
+# CONFIG_USB_YEALINK is not set
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+CONFIG_USB_ATI_REMOTE2=m
+CONFIG_USB_KEYSPAN_REMOTE=m
+CONFIG_USB_APPLETOUCH=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_NET_AX8817X=m
+CONFIG_USB_NET_CDCETHER=m
+CONFIG_USB_NET_DM9601=m
+CONFIG_USB_NET_GL620A=m
+CONFIG_USB_NET_NET1080=m
+CONFIG_USB_NET_PLUSB=m
+CONFIG_USB_NET_RNDIS_HOST=m
+CONFIG_USB_NET_CDC_SUBSET=m
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_NET_ZAURUS=m
+CONFIG_USB_MON=y
+
+#
+# USB port drivers
+#
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_AIRPRIME=m
+CONFIG_USB_SERIAL_ARK3116=m
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_CP2101=m
+CONFIG_USB_SERIAL_CYPRESS_M8=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_FUNSOFT=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_GARMIN=m
+CONFIG_USB_SERIAL_IPW=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_NAVMAN=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_HP4X=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_SIERRAWIRELESS=m
+CONFIG_USB_SERIAL_TI=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OPTION=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_LEGOTOWER=m
+CONFIG_USB_LCD=m
+CONFIG_USB_LED=m
+# CONFIG_USB_CYPRESS_CY7C63 is not set
+# CONFIG_USB_CYTHERM is not set
+CONFIG_USB_PHIDGETKIT=m
+CONFIG_USB_PHIDGETSERVO=m
+CONFIG_USB_IDMOUSE=m
+CONFIG_USB_APPLEDISPLAY=m
+CONFIG_USB_SISUSBVGA=m
+CONFIG_USB_SISUSBVGA_CON=y
+CONFIG_USB_LD=m
+CONFIG_USB_TEST=m
+
+#
+# USB DSL modem support
+#
+CONFIG_USB_ATM=m
+CONFIG_USB_SPEEDTOUCH=m
+CONFIG_USB_CXACRU=m
+CONFIG_USB_UEAGLEATM=m
+CONFIG_USB_XUSBATM=m
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+CONFIG_MMC=m
+# CONFIG_MMC_DEBUG is not set
+CONFIG_MMC_BLOCK=m
+CONFIG_MMC_SDHCI=m
+# CONFIG_MMC_WBSD is not set
+
+#
+# LED devices
+#
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+
+#
+# LED drivers
+#
+
+#
+# LED Triggers
+#
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_TIMER=m
+CONFIG_LEDS_TRIGGER_IDE_DISK=y
+CONFIG_LEDS_TRIGGER_HEARTBEAT=m
+# CONFIG_INFINIBAND is not set
+
+#
+# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
+#
+
+#
+# Real Time Clock
+#
+CONFIG_RTC_LIB=m
+CONFIG_RTC_CLASS=m
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_SYSFS=m
+CONFIG_RTC_INTF_PROC=m
+CONFIG_RTC_INTF_DEV=m
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+
+#
+# RTC drivers
+#
+CONFIG_RTC_DRV_X1205=m
+CONFIG_RTC_DRV_DS1307=m
+CONFIG_RTC_DRV_DS1553=m
+CONFIG_RTC_DRV_ISL1208=m
+CONFIG_RTC_DRV_DS1672=m
+CONFIG_RTC_DRV_DS1742=m
+CONFIG_RTC_DRV_PCF8563=m
+CONFIG_RTC_DRV_PCF8583=m
+CONFIG_RTC_DRV_RS5C372=m
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_TEST is not set
+CONFIG_RTC_DRV_V3020=m
+
+#
+# DMA Engine support
+#
+CONFIG_DMA_ENGINE=y
+
+#
+# DMA Clients
+#
+CONFIG_NET_DMA=y
+
+#
+# DMA Devices
+#
+CONFIG_INTEL_IOATDMA=m
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT2_FS_XIP=y
+CONFIG_FS_XIP=y
+CONFIG_EXT3_FS=m
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=m
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
+# CONFIG_JFS_DEBUG is not set
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_QUOTA=y
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+# CONFIG_XFS_RT is not set
+CONFIG_GFS2_FS=m
+CONFIG_GFS2_FS_LOCKING_NOLOCK=m
+CONFIG_GFS2_FS_LOCKING_DLM=m
+CONFIG_OCFS2_FS=m
+# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_MINIX_FS=m
+CONFIG_ROMFS_FS=m
+CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
+CONFIG_QUOTA=y
+# CONFIG_QFMT_V1 is not set
+CONFIG_QFMT_V2=y
+CONFIG_QUOTACTL=y
+CONFIG_DNOTIFY=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_FUSE_FS=m
+
+#
+# Caches
+#
+CONFIG_FSCACHE=m
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_UDF_FS=m
+CONFIG_UDF_NLS=y
+CONFIG_CACHEFILES=m
+CONFIG_CACHEFILES_DEBUG=y
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="ascii"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+CONFIG_AFFS_FS=m
+CONFIG_ECRYPT_FS=m
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EFS_FS=m
+# CONFIG_JFFS_FS is not set
+CONFIG_JFFS2_FS=m
+CONFIG_JFFS2_FS_DEBUG=0
+CONFIG_JFFS2_FS_WRITEBUFFER=y
+CONFIG_JFFS2_SUMMARY=y
+# CONFIG_JFFS2_FS_XATTR is not set
+# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set
+CONFIG_JFFS2_ZLIB=y
+CONFIG_JFFS2_RTIME=y
+# CONFIG_JFFS2_RUBIN is not set
+CONFIG_CRAMFS=m
+CONFIG_SQUASHFS=m
+# CONFIG_SQUASHFS_EMBEDDED is not set
+CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3
+# CONFIG_SQUASHFS_VMALLOC is not set
+CONFIG_VXFS_FS=m
+# CONFIG_HPFS_FS is not set
+CONFIG_QNX4FS_FS=m
+CONFIG_SYSV_FS=m
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+# CONFIG_UFS_DEBUG is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_FSCACHE=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_V3_ACL=y
+CONFIG_NFSD_V4=y
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_NFS_ACL_SUPPORT=m
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=m
+CONFIG_SUNRPC_GSS=m
+CONFIG_RPCSEC_GSS_KRB5=m
+CONFIG_RPCSEC_GSS_SPKM3=m
+# CONFIG_SMB_FS is not set
+CONFIG_CIFS=m
+# CONFIG_CIFS_STATS is not set
+CONFIG_CIFS_WEAK_PW_HASH=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+# CONFIG_CIFS_DEBUG2 is not set
+# CONFIG_CIFS_EXPERIMENTAL is not set
+# CONFIG_NCP_FS is not set
+CONFIG_CODA_FS=m
+# CONFIG_CODA_FS_OLD_API is not set
+# CONFIG_AFS_FS is not set
+CONFIG_9P_FS=m
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Distributed Lock Manager
+#
+CONFIG_DLM=m
+CONFIG_DLM_DEBUG=y
+
+#
+# Library routines
+#
+CONFIG_CRC_CCITT=m
+CONFIG_CRC16=m
+CONFIG_CRC32=y
+CONFIG_LIBCRC32C=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+CONFIG_TEXTSEARCH=y
+CONFIG_TEXTSEARCH_KMP=m
+CONFIG_TEXTSEARCH_BM=m
+CONFIG_TEXTSEARCH_FSM=m
+CONFIG_PLIST=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+
+#
+# Instrumentation Support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=m
+CONFIG_OPROFILE_CELL=y
+CONFIG_KPROBES=y
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_KERNEL=y
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_DETECT_SOFTLOCKUP=y
+CONFIG_SCHEDSTATS=y
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_RT_MUTEXES is not set
+# CONFIG_RT_MUTEX_TESTER is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_RWSEMS is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+# CONFIG_DEBUG_KOBJECT is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
+CONFIG_DEBUG_LIST=y
+# CONFIG_FORCED_INLINING is not set
+CONFIG_BOOT_DELAY=y
+# CONFIG_RCU_TORTURE_TEST is not set
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+CONFIG_XMON_DEFAULT=y
+CONFIG_IRQSTACKS=y
+CONFIG_BOOTX_TEXT=y
+# CONFIG_PPC_EARLY_DEBUG is not set
+
+#
+# Security options
+#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_NETWORK_XFRM=y
+CONFIG_SECURITY_CAPABILITIES=y
+# CONFIG_SECURITY_ROOTPLUG is not set
+# CONFIG_SECURITY_SECLVL is not set
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
+CONFIG_SECURITY_SELINUX_DISABLE=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+CONFIG_SECURITY_SELINUX_AVC_STATS=y
+CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
+CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y
+# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
+CONFIG_KEYS_COMPAT=y
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_API=m
+CONFIG_CRYPTO_ALGAPI=m
+CONFIG_CRYPTO_AEAD=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_SEQIV=m
+CONFIG_CRYPTO_HASH=m
+CONFIG_CRYPTO_MANAGER=m
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_NHMAC=m
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=m
+CONFIG_CRYPTO_SHA1=y
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_CTR=m
+CONFIG_CRYPTO_CCM=m
+CONFIG_CRYPTO_DES=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_KHAZAD=m
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_CRC32C=y
+# CONFIG_CRYPTO_TEST is not set
+CONFIG_CRYPTO_AUTHENC=m
+CONFIG_CRYPTO_SIGNATURE=y
+CONFIG_CRYPTO_SIGNATURE_DSA=y
+CONFIG_CRYPTO_MPILIB=y
+
+#
+# Hardware crypto devices
+#
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ppc64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ppc64.config

new file mode 100644 (file)

index 0000000..d8a493c
--- /dev/null
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ppc64.config
@@ -0,0 +1,3104 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.18-prep
+# Sat Jul 12 00:22:15 2008
+#
+CONFIG_PPC64=y
+CONFIG_64BIT=y
+CONFIG_PPC_MERGE=y
+CONFIG_MMU=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_IRQ_PER_CPU=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_PPC=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_SYSVIPC_COMPAT=y
+CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
+CONFIG_ARCH_MAY_HAVE_PC_FDC=y
+CONFIG_PPC_OF=y
+CONFIG_PPC_UDBG_16550=y
+CONFIG_GENERIC_TBSYNC=y
+CONFIG_AUDIT_ARCH=y
+# CONFIG_DEFAULT_UIMAGE is not set
+
+#
+# Processor support
+#
+# CONFIG_POWER4_ONLY is not set
+CONFIG_POWER3=y
+CONFIG_POWER4=y
+CONFIG_PPC_FPU=y
+# CONFIG_PPC_DCR_NATIVE is not set
+CONFIG_PPC_DCR_MMIO=y
+CONFIG_PPC_DCR=y
+CONFIG_ALTIVEC=y
+CONFIG_PPC_STD_MMU=y
+CONFIG_VIRT_CPU_ACCOUNTING=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=128
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+# CONFIG_BSD_PROCESS_ACCT_V3 is not set
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_AUDIT=y
+CONFIG_AUDITSYSCALL=y
+# CONFIG_IKCONFIG is not set
+CONFIG_CPUSETS=y
+CONFIG_RELAY=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_EMBEDDED is not set
+CONFIG_SYSCTL=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_ALL is not set
+CONFIG_KALLSYMS_EXTRA_PASS=y
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SHMEM=y
+CONFIG_SLAB=y
+CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_RT_MUTEXES=y
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0
+# CONFIG_SLOB is not set
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_MODVERSIONS=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_MODULE_SIG=y
+# CONFIG_MODULE_SIG_FORCE is not set
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+
+#
+# Process debugging support
+#
+CONFIG_PTRACE=y
+CONFIG_UTRACE=y
+
+#
+# Block layer
+#
+CONFIG_BLK_DEV_IO_TRACE=y
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+CONFIG_DEFAULT_DEADLINE=y
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="deadline"
+
+#
+# Platform support
+#
+CONFIG_PPC_MULTIPLATFORM=y
+# CONFIG_PPC_ISERIES is not set
+# CONFIG_EMBEDDED6xx is not set
+# CONFIG_APUS is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC_PMAC=y
+CONFIG_PPC_PMAC64=y
+# CONFIG_PPC_MAPLE is not set
+CONFIG_PPC_CELL=y
+CONFIG_PPC_CELL_NATIVE=y
+CONFIG_PPC_IBM_CELL_BLADE=y
+CONFIG_UDBG_RTAS_CONSOLE=y
+CONFIG_XICS=y
+CONFIG_U3_DART=y
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_ERROR_LOGGING=y
+CONFIG_RTAS_PROC=y
+CONFIG_RTAS_FLASH=y
+CONFIG_MMIO_NVRAM=y
+CONFIG_PPC_PMI=m
+CONFIG_MPIC_BROKEN_U3=y
+CONFIG_IBMVIO=y
+CONFIG_IBMEBUS=y
+# CONFIG_PPC_MPC106 is not set
+CONFIG_PPC_970_NAP=y
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+CONFIG_CPU_FREQ_DEBUG=y
+CONFIG_CPU_FREQ_STAT=m
+CONFIG_CPU_FREQ_STAT_DETAILS=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=m
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=m
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
+CONFIG_CPU_FREQ_PMAC64=y
+CONFIG_AXON_RAM=m
+# CONFIG_WANT_EARLY_SERIAL is not set
+CONFIG_MPIC=y
+
+#
+# Cell Broadband Engine options
+#
+CONFIG_SPU_FS=m
+CONFIG_SPU_BASE=y
+CONFIG_SPUFS_MMAP=y
+CONFIG_CBE_RAS=y
+CONFIG_CBE_THERM=m
+CONFIG_CBE_CPUFREQ=m
+CONFIG_CBE_CPUFREQ_PMI=m
+CONFIG_CBE_AXON_UTL=y
+CONFIG_CBE_AXON_PCI=y
+
+#
+# Kernel options
+#
+# CONFIG_HZ_100 is not set
+# CONFIG_HZ_250 is not set
+CONFIG_HZ_1000=y
+CONFIG_HZ=1000
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_PREEMPT_BKL=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=y
+CONFIG_FORCE_MAX_ZONEORDER=9
+CONFIG_IOMMU_VMERGE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_PPC_SPLPAR=y
+CONFIG_EEH=y
+CONFIG_SCANLOG=y
+CONFIG_LPARCFG=y
+CONFIG_NUMA=y
+CONFIG_NODES_SHIFT=4
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_NEED_MULTIPLE_NODES=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPARSEMEM_EXTREME=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTPLUG_SPARSE=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+CONFIG_RESOURCES_64BIT=y
+CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
+CONFIG_ARCH_MEMORY_PROBE=y
+CONFIG_NODES_SPAN_OTHER_NODES=y
+CONFIG_PPC_64K_PAGES=y
+CONFIG_SCHED_SMT=y
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+CONFIG_PM=y
+CONFIG_PM_LEGACY=y
+# CONFIG_PM_DEBUG is not set
+# CONFIG_SECCOMP is not set
+CONFIG_ISA_DMA_API=y
+
+#
+# Bus options
+#
+CONFIG_GENERIC_ISA_DMA=y
+# CONFIG_MPIC_WEIRD is not set
+CONFIG_PPC_I8259=y
+# CONFIG_PPC_INDIRECT_PCI is not set
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+CONFIG_PCI_MSI=y
+# CONFIG_PCI_DEBUG is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+CONFIG_PCCARD=y
+# CONFIG_PCMCIA_DEBUG is not set
+CONFIG_PCMCIA=y
+CONFIG_PCMCIA_LOAD_CIS=y
+CONFIG_PCMCIA_IOCTL=y
+CONFIG_CARDBUS=y
+
+#
+# PC-card bridges
+#
+CONFIG_YENTA=y
+CONFIG_YENTA_O2=y
+CONFIG_YENTA_RICOH=y
+CONFIG_YENTA_TI=y
+CONFIG_YENTA_ENE_TUNE=y
+CONFIG_YENTA_TOSHIBA=y
+CONFIG_PD6729=m
+CONFIG_I82092=m
+CONFIG_PCCARD_NONSTATIC=y
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+CONFIG_HOTPLUG_PCI_SHPC=m
+CONFIG_HOTPLUG_PCI_SHPC_POLL_EVENT_MODE=y
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_KERNEL_START=0xc000000000000000
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+# CONFIG_NETDEBUG is not set
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_UNIX=y
+CONFIG_XFRM=y
+CONFIG_XFRM_NALGO=m
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=m
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_ASK_IP_FIB_HASH=y
+# CONFIG_IP_FIB_TRIE is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+# CONFIG_IP_ROUTE_MULTIPATH_CACHED is not set
+CONFIG_IP_ROUTE_VERBOSE=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+CONFIG_INET_XFRM_TUNNEL=m
+CONFIG_INET_TUNNEL=m
+CONFIG_INET_XFRM_MODE_TRANSPORT=m
+CONFIG_INET_XFRM_MODE_TUNNEL=m
+CONFIG_INET_DIAG=m
+CONFIG_INET_TCP_DIAG=m
+CONFIG_TCP_CONG_ADVANCED=y
+
+#
+# TCP congestion control
+#
+CONFIG_TCP_CONG_BIC=y
+CONFIG_TCP_CONG_CUBIC=m
+CONFIG_TCP_CONG_WESTWOOD=m
+CONFIG_TCP_CONG_HTCP=m
+CONFIG_TCP_CONG_HSTCP=m
+CONFIG_TCP_CONG_HYBLA=m
+CONFIG_TCP_CONG_VEGAS=m
+CONFIG_TCP_CONG_SCALABLE=m
+CONFIG_TCP_CONG_LP=m
+CONFIG_TCP_CONG_VENO=m
+
+#
+# IP: Virtual Server Configuration
+#
+CONFIG_IP_VS=m
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=12
+
+#
+# IPVS transport protocol load balancing support
+#
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_PROTO_ESP=y
+CONFIG_IP_VS_PROTO_AH=y
+
+#
+# IPVS scheduler
+#
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+CONFIG_IP_VS_SED=m
+CONFIG_IP_VS_NQ=m
+
+#
+# IPVS application helper
+#
+CONFIG_IP_VS_FTP=m
+CONFIG_IPV6=m
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_INET6_XFRM_TUNNEL=m
+CONFIG_INET6_TUNNEL=m
+CONFIG_INET6_XFRM_MODE_TRANSPORT=m
+CONFIG_INET6_XFRM_MODE_TUNNEL=m
+CONFIG_IPV6_TUNNEL=m
+# CONFIG_IPV6_SUBTREES is not set
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_ROUTE_FWMARK=y
+CONFIG_NETWORK_SECMARK=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# Core Netfilter Configuration
+#
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_NETLINK_LOG=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
+CONFIG_NETFILTER_XT_TARGET_SECMARK=m
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
+CONFIG_NETFILTER_XT_MATCH_ESP=m
+CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_LIMIT=m
+CONFIG_NETFILTER_XT_MATCH_MAC=m
+CONFIG_NETFILTER_XT_MATCH_MARK=m
+CONFIG_NETFILTER_XT_MATCH_POLICY=m
+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
+CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
+CONFIG_NETFILTER_XT_MATCH_QUOTA=m
+CONFIG_NETFILTER_XT_MATCH_REALM=m
+CONFIG_NETFILTER_XT_MATCH_SCTP=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_CT_ACCT=y
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_CONNTRACK_SECMARK=y
+CONFIG_IP_NF_CONNTRACK_EVENTS=y
+CONFIG_IP_NF_CONNTRACK_NETLINK=m
+CONFIG_IP_NF_CT_PROTO_SCTP=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_NETBIOS_NS=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_PPTP=m
+CONFIG_IP_NF_H323=m
+CONFIG_IP_NF_SIP=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_ADDRTYPE=m
+CONFIG_IP_NF_MATCH_HASHLIMIT=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_NAT_PPTP=m
+CONFIG_IP_NF_NAT_H323=m
+CONFIG_IP_NF_NAT_SIP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+
+#
+# IPv6: Netfilter Configuration (EXPERIMENTAL)
+#
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_HL=m
+CONFIG_IP6_NF_RAW=m
+
+#
+# DECnet: Netfilter Configuration
+#
+# CONFIG_DECNET_NF_GRABULATOR is not set
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_SNAT=m
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_BRIDGE_EBT_ULOG=m
+
+#
+# DCCP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP=m
+CONFIG_INET_DCCP_DIAG=m
+CONFIG_IP_DCCP_ACKVEC=y
+
+#
+# DCCP CCIDs Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP_CCID2=m
+CONFIG_IP_DCCP_CCID3=m
+CONFIG_IP_DCCP_TFRC_LIB=m
+
+#
+# DCCP Kernel Hacking
+#
+# CONFIG_IP_DCCP_DEBUG is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+# CONFIG_SCTP_HMAC_NONE is not set
+# CONFIG_SCTP_HMAC_SHA1 is not set
+CONFIG_SCTP_HMAC_MD5=y
+
+#
+# TIPC Configuration (EXPERIMENTAL)
+#
+CONFIG_TIPC=m
+# CONFIG_TIPC_ADVANCED is not set
+# CONFIG_TIPC_DEBUG is not set
+CONFIG_ATM=m
+CONFIG_ATM_CLIP=m
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=m
+# CONFIG_ATM_MPOA is not set
+CONFIG_ATM_BR2684=m
+# CONFIG_ATM_BR2684_IPFILTER is not set
+CONFIG_BRIDGE=m
+CONFIG_VLAN_8021Q=m
+CONFIG_DECNET=m
+CONFIG_DECNET_ROUTER=y
+CONFIG_DECNET_ROUTE_FWMARK=y
+CONFIG_LLC=y
+# CONFIG_LLC2 is not set
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=m
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_ECONET is not set
+CONFIG_WAN_ROUTER=m
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+# CONFIG_NET_SCH_CLK_JIFFIES is not set
+CONFIG_NET_SCH_CLK_GETTIMEOFDAY=y
+# CONFIG_NET_SCH_CLK_CPU is not set
+
+#
+# Queueing/Scheduling
+#
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_ATM=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_INGRESS=m
+
+#
+# Classification
+#
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_BASIC=m
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_CLS_U32_PERF=y
+CONFIG_CLS_U32_MARK=y
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_STACK=32
+CONFIG_NET_EMATCH_CMP=m
+CONFIG_NET_EMATCH_NBYTE=m
+CONFIG_NET_EMATCH_U32=m
+CONFIG_NET_EMATCH_META=m
+CONFIG_NET_EMATCH_TEXT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_GACT_PROB=y
+CONFIG_NET_ACT_MIRRED=m
+CONFIG_NET_ACT_IPT=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_ACT_SIMP=m
+CONFIG_NET_CLS_IND=y
+CONFIG_NET_ESTIMATOR=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+# CONFIG_NET_TCPPROBE is not set
+# CONFIG_HAMRADIO is not set
+CONFIG_IRDA=m
+
+#
+# IrDA protocols
+#
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+# CONFIG_IRDA_ULTRA is not set
+
+#
+# IrDA options
+#
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+CONFIG_IRDA_FAST_RR=y
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+
+#
+# SIR device drivers
+#
+CONFIG_IRTTY_SIR=m
+
+#
+# Dongle support
+#
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_TOIM3232_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_MA600_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_MCP2120_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_ACT200L_DONGLE=m
+
+#
+# Old SIR device drivers
+#
+
+#
+# Old Serial dongle support
+#
+
+#
+# FIR device drivers
+#
+CONFIG_USB_IRDA=m
+CONFIG_SIGMATEL_FIR=m
+CONFIG_NSC_FIR=m
+CONFIG_WINBOND_FIR=m
+CONFIG_SMC_IRCC_FIR=m
+CONFIG_ALI_FIR=m
+CONFIG_VLSI_FIR=m
+CONFIG_VIA_FIR=m
+CONFIG_MCS_FIR=m
+CONFIG_BT=m
+CONFIG_BT_L2CAP=m
+CONFIG_BT_SCO=m
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+CONFIG_BT_BNEP=m
+CONFIG_BT_BNEP_MC_FILTER=y
+CONFIG_BT_BNEP_PROTO_FILTER=y
+CONFIG_BT_CMTP=m
+CONFIG_BT_HIDP=m
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BT_HCIUSB=m
+CONFIG_BT_HCIUSB_SCO=y
+CONFIG_BT_HCIUART=m
+CONFIG_BT_HCIUART_H4=y
+CONFIG_BT_HCIUART_BCSP=y
+CONFIG_BT_HCIBCM203X=m
+CONFIG_BT_HCIBPA10X=m
+CONFIG_BT_HCIBFUSB=m
+CONFIG_BT_HCIDTL1=m
+CONFIG_BT_HCIBT3C=m
+CONFIG_BT_HCIBLUECARD=m
+CONFIG_BT_HCIBTUART=m
+CONFIG_BT_HCIVHCI=m
+CONFIG_TUX=m
+
+#
+# TUX options
+#
+CONFIG_TUX_EXTCGI=y
+CONFIG_TUX_EXTENDED_LOG=y
+# CONFIG_TUX_DEBUG is not set
+CONFIG_NETLABEL=y
+CONFIG_FIB_RULES=y
+
+#
+# Wireless
+#
+CONFIG_CFG80211=m
+CONFIG_NL80211=y
+CONFIG_WIRELESS_EXT=y
+CONFIG_NET_WIRELESS_RTNETLINK=y
+CONFIG_MAC80211=m
+CONFIG_MAC80211_RCSIMPLE=y
+CONFIG_MAC80211_LEDS=y
+# CONFIG_MAC80211_DEBUGFS is not set
+# CONFIG_MAC80211_DEBUG is not set
+CONFIG_IEEE80211=m
+CONFIG_IEEE80211_DEBUG=y
+CONFIG_IEEE80211_CRYPT_WEP=m
+CONFIG_IEEE80211_CRYPT_CCMP=m
+CONFIG_IEEE80211_CRYPT_TKIP=m
+CONFIG_IEEE80211_SOFTMAC=m
+CONFIG_IEEE80211_SOFTMAC_DEBUG=y
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=y
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
+# CONFIG_SYS_HYPERVISOR is not set
+
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+CONFIG_CONNECTOR=y
+CONFIG_PROC_EVENTS=y
+
+#
+# Memory Technology Devices (MTD)
+#
+CONFIG_MTD=m
+# CONFIG_MTD_DEBUG is not set
+CONFIG_MTD_CONCAT=m
+CONFIG_MTD_PARTITIONS=y
+CONFIG_MTD_REDBOOT_PARTS=m
+CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1
+# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set
+# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set
+CONFIG_MTD_CMDLINE_PARTS=y
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=m
+CONFIG_MTD_BLOCK=m
+CONFIG_MTD_BLOCK_RO=m
+CONFIG_FTL=m
+CONFIG_NFTL=m
+CONFIG_NFTL_RW=y
+CONFIG_INFTL=m
+CONFIG_RFD_FTL=m
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=m
+CONFIG_MTD_JEDECPROBE=m
+CONFIG_MTD_GEN_PROBE=m
+# CONFIG_MTD_CFI_ADV_OPTIONS is not set
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+CONFIG_MTD_CFI_INTELEXT=m
+CONFIG_MTD_CFI_AMDSTD=m
+CONFIG_MTD_CFI_STAA=m
+CONFIG_MTD_CFI_UTIL=m
+CONFIG_MTD_RAM=m
+CONFIG_MTD_ROM=m
+CONFIG_MTD_ABSENT=m
+# CONFIG_MTD_OBSOLETE_CHIPS is not set
+
+#
+# Mapping drivers for chip access
+#
+CONFIG_MTD_COMPLEX_MAPPINGS=y
+# CONFIG_MTD_PHYSMAP is not set
+CONFIG_MTD_PCI=m
+# CONFIG_MTD_PLATRAM is not set
+
+#
+# Self-contained MTD device drivers
+#
+CONFIG_MTD_PMC551=m
+# CONFIG_MTD_PMC551_BUGFIX is not set
+# CONFIG_MTD_PMC551_DEBUG is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_PHRAM is not set
+CONFIG_MTD_MTDRAM=m
+CONFIG_MTDRAM_TOTAL_SIZE=4096
+CONFIG_MTDRAM_ERASE_SIZE=128
+CONFIG_MTD_BLOCK2MTD=m
+
+#
+# Disk-On-Chip Device Drivers
+#
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOC2001PLUS is not set
+
+#
+# NAND Flash Device Drivers
+#
+CONFIG_MTD_NAND=m
+# CONFIG_MTD_NAND_VERIFY_WRITE is not set
+CONFIG_MTD_NAND_ECC_SMC=y
+CONFIG_MTD_NAND_IDS=m
+# CONFIG_MTD_NAND_DISKONCHIP is not set
+CONFIG_MTD_NAND_NANDSIM=m
+
+#
+# OneNAND Flash Device Drivers
+#
+# CONFIG_MTD_ONENAND is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_SERIAL=m
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_PC_PCMCIA=m
+CONFIG_PARPORT_NOT_PC=y
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_AX88796 is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=m
+CONFIG_PARIDE=m
+CONFIG_PARIDE_PARPORT=m
+
+#
+# Parallel IDE high-level drivers
+#
+CONFIG_PARIDE_PD=m
+CONFIG_PARIDE_PCD=m
+CONFIG_PARIDE_PF=m
+CONFIG_PARIDE_PT=m
+CONFIG_PARIDE_PG=m
+
+#
+# Parallel IDE protocol modules
+#
+CONFIG_PARIDE_ATEN=m
+CONFIG_PARIDE_BPCK=m
+CONFIG_PARIDE_COMM=m
+CONFIG_PARIDE_DSTR=m
+CONFIG_PARIDE_FIT2=m
+CONFIG_PARIDE_FIT3=m
+CONFIG_PARIDE_EPAT=m
+CONFIG_PARIDE_EPATC8=y
+CONFIG_PARIDE_EPIA=m
+CONFIG_PARIDE_FRIQ=m
+CONFIG_PARIDE_FRPW=m
+CONFIG_PARIDE_KBIC=m
+CONFIG_PARIDE_KTTI=m
+CONFIG_PARIDE_ON20=m
+CONFIG_PARIDE_ON26=m
+# CONFIG_BLK_CPQ_DA is not set
+CONFIG_BLK_CPQ_CISS_DA=m
+CONFIG_CISS_SCSI_TAPE=y
+CONFIG_BLK_DEV_DAC960=m
+CONFIG_BLK_DEV_UMEM=m
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_SX8=m
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=16384
+CONFIG_BLK_DEV_RAM_BLOCKSIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CDROM_PKTCDVD=m
+CONFIG_CDROM_PKTCDVD_BUFFERS=8
+# CONFIG_CDROM_PKTCDVD_WCACHE is not set
+CONFIG_ATA_OVER_ETH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_IDE_SATA is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+CONFIG_BLK_DEV_IDECS=m
+CONFIG_BLK_DEV_IDECD=m
+# CONFIG_BLK_DEV_IDETAPE is not set
+CONFIG_BLK_DEV_IDEFLOPPY=y
+CONFIG_BLK_DEV_IDESCSI=m
+CONFIG_IDE_TASK_IOCTL=y
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+CONFIG_BLK_DEV_GENERIC=y
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_SL82C105=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_TRIFLEX=y
+# CONFIG_BLK_DEV_CY82C693 is not set
+CONFIG_BLK_DEV_CS5520=y
+CONFIG_BLK_DEV_CS5530=y
+CONFIG_BLK_DEV_HPT34X=y
+# CONFIG_HPT34X_AUTODMA is not set
+CONFIG_BLK_DEV_HPT366=y
+# CONFIG_BLK_DEV_SC1200 is not set
+CONFIG_BLK_DEV_PIIX=y
+CONFIG_BLK_DEV_IT821X=y
+# CONFIG_BLK_DEV_NS87415 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+# CONFIG_PDC202XX_BURST is not set
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+CONFIG_BLK_DEV_VIA82CXXX=y
+CONFIG_BLK_DEV_IDE_PMAC=y
+CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST=y
+CONFIG_BLK_DEV_IDEDMA_PMAC=y
+# CONFIG_IDE_ARM is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_RAID_ATTRS=m
+CONFIG_SCSI=m
+CONFIG_SCSI_NETLINK=y
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_IOSTATS=y
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+CONFIG_SCSI_SAS_ATTRS=m
+CONFIG_SCSI_SAS_LIBSAS=m
+CONFIG_SCSI_SAS_ATA=y
+# CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_ISCSI_TCP is not set
+CONFIG_BLK_DEV_3W_XXXX_RAID=m
+CONFIG_SCSI_3W_9XXX=m
+CONFIG_SCSI_ACARD=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_SCSI_AIC7XXX=m
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=4
+CONFIG_AIC7XXX_RESET_DELAY_MS=15000
+# CONFIG_AIC7XXX_DEBUG_ENABLE is not set
+CONFIG_AIC7XXX_DEBUG_MASK=0
+# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set
+CONFIG_SCSI_AIC7XXX_OLD=m
+CONFIG_SCSI_AIC79XX=m
+CONFIG_AIC79XX_CMDS_PER_DEVICE=4
+CONFIG_AIC79XX_RESET_DELAY_MS=15000
+# CONFIG_AIC79XX_ENABLE_RD_STRM is not set
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
+CONFIG_SCSI_AIC94XX=m
+# CONFIG_AIC94XX_DEBUG is not set
+CONFIG_SCSI_ARCMSR=m
+CONFIG_MEGARAID_NEWGEN=y
+CONFIG_MEGARAID_MM=m
+CONFIG_MEGARAID_MAILBOX=m
+CONFIG_MEGARAID_LEGACY=m
+CONFIG_MEGARAID_SAS=m
+CONFIG_SCSI_HPTIOP=m
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+CONFIG_SCSI_GDTH=m
+CONFIG_SCSI_IPS=m
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_INITIO=m
+CONFIG_SCSI_INIA100=m
+CONFIG_SCSI_PPA=m
+CONFIG_SCSI_IMM=m
+# CONFIG_SCSI_IZIP_EPP16 is not set
+# CONFIG_SCSI_IZIP_SLOW_CTR is not set
+CONFIG_SCSI_STEX=m
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+CONFIG_SCSI_SYM53C8XX_MMIO=y
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+CONFIG_SCSI_QLOGIC_1280=m
+CONFIG_SCSI_QLA_FC=m
+# CONFIG_SCSI_QLA_ISCSI is not set
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_DC395x=m
+# CONFIG_SCSI_DC390T is not set
+# CONFIG_SCSI_DEBUG is not set
+
+#
+# PCMCIA SCSI adapter support
+#
+# CONFIG_PCMCIA_FDOMAIN is not set
+CONFIG_PCMCIA_QLOGIC=m
+CONFIG_PCMCIA_SYM53C500=m
+CONFIG_ATA=m
+# CONFIG_ATA_NONSTANDARD is not set
+CONFIG_SATA_AHCI=m
+CONFIG_SATA_SVW=m
+CONFIG_ATA_PIIX=m
+CONFIG_SATA_MV=m
+CONFIG_SATA_NV=m
+CONFIG_PDC_ADMA=m
+CONFIG_SATA_QSTOR=m
+CONFIG_SATA_PROMISE=m
+CONFIG_SATA_SX4=m
+CONFIG_SATA_SIL=m
+CONFIG_SATA_SIL24=m
+CONFIG_SATA_SIS=m
+CONFIG_SATA_ULI=m
+CONFIG_SATA_VIA=m
+CONFIG_SATA_VITESSE=m
+CONFIG_SATA_INIC162X=m
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD640_PCI is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_TRIFLEX is not set
+CONFIG_PATA_MARVELL=m
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_NS87415 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PCMCIA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+CONFIG_PATA_PDC2027X=m
+# CONFIG_PATA_SIL680 is not set
+CONFIG_PATA_SIS=m
+# CONFIG_PATA_VIA is not set
+# CONFIG_PATA_WINBOND is not set
+CONFIG_ATA_INTEL_COMBINED=y
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID456=m
+CONFIG_MD_RAID5_RESHAPE=y
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
+CONFIG_DM_MULTIPATH_RDAC=m
+CONFIG_DM_MULTIPATH_HP=m
+CONFIG_DM_UEVENT=y
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=y
+CONFIG_FUSION_SPI=m
+CONFIG_FUSION_FC=m
+CONFIG_FUSION_SAS=m
+CONFIG_FUSION_MAX_SGE=128
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_FUSION_LOGGING=y
+CONFIG_FIREWIRE=m
+CONFIG_FIREWIRE_OHCI=m
+CONFIG_FIREWIRE_SBP2=m
+
+#
+# IEEE 1394 (FireWire) support
+#
+# CONFIG_IEEE1394 is not set
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+CONFIG_ADB_PMU=y
+CONFIG_ADB_PMU_LED=y
+CONFIG_ADB_PMU_LED_IDE=y
+CONFIG_PMAC_SMU=y
+CONFIG_THERM_PM72=y
+CONFIG_WINDFARM=y
+CONFIG_WINDFARM_PM81=y
+CONFIG_WINDFARM_PM91=y
+CONFIG_WINDFARM_PM112=y
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+CONFIG_IFB=m
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+
+#
+# PHY device support
+#
+CONFIG_PHYLIB=m
+
+#
+# MII PHY device drivers
+#
+CONFIG_MARVELL_PHY=m
+CONFIG_DAVICOM_PHY=m
+CONFIG_QSEMI_PHY=m
+CONFIG_LXT_PHY=m
+CONFIG_CICADA_PHY=m
+CONFIG_VITESSE_PHY=m
+CONFIG_SMSC_PHY=m
+CONFIG_FIXED_PHY=m
+CONFIG_FIXED_MII_10_FDX=y
+CONFIG_FIXED_MII_100_FDX=y
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=m
+CONFIG_HAPPYMEAL=m
+CONFIG_SUNGEM=m
+CONFIG_CASSINI=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+CONFIG_NET_TULIP=y
+CONFIG_DE2104X=m
+CONFIG_TULIP=m
+# CONFIG_TULIP_MWI is not set
+CONFIG_TULIP_MMIO=y
+# CONFIG_TULIP_NAPI is not set
+CONFIG_DE4X5=m
+CONFIG_WINBOND_840=m
+CONFIG_DM9102=m
+CONFIG_ULI526X=m
+CONFIG_PCMCIA_XIRCOM=m
+# CONFIG_HP100 is not set
+CONFIG_IBMVETH=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_AMD8111E_NAPI=y
+CONFIG_ADAPTEC_STARFIRE=m
+CONFIG_ADAPTEC_STARFIRE_NAPI=y
+CONFIG_B44=m
+CONFIG_FORCEDETH=m
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+CONFIG_FEALNX=m
+CONFIG_NATSEMI=m
+CONFIG_NE2K_PCI=m
+CONFIG_8139CP=m
+CONFIG_8139TOO=m
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+CONFIG_8139TOO_8129=y
+# CONFIG_8139_OLD_RX_RESET is not set
+CONFIG_SIS900=m
+CONFIG_EPIC100=m
+CONFIG_SUNDANCE=m
+# CONFIG_SUNDANCE_MMIO is not set
+CONFIG_VIA_RHINE=m
+CONFIG_VIA_RHINE_MMIO=y
+CONFIG_VIA_RHINE_NAPI=y
+CONFIG_NET_POCKET=y
+CONFIG_DE600=m
+CONFIG_DE620=m
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+CONFIG_DL2K=m
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
+CONFIG_E1000E=m
+CONFIG_IGB=m
+CONFIG_NS83820=m
+CONFIG_HAMACHI=m
+CONFIG_YELLOWFIN=m
+CONFIG_R8169=m
+CONFIG_R8169_NAPI=y
+CONFIG_R8169_VLAN=y
+CONFIG_SIS190=m
+CONFIG_SKGE=m
+CONFIG_SKY2=m
+# CONFIG_SK98LIN is not set
+CONFIG_VIA_VELOCITY=m
+CONFIG_TIGON3=m
+CONFIG_BNX2=m
+CONFIG_SPIDER_NET=m
+# CONFIG_MV643XX_ETH is not set
+CONFIG_QLA3XXX=m
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_CHELSIO_T1=m
+CONFIG_CHELSIO_T3=m
+CONFIG_EHEA=m
+CONFIG_IXGBE=m
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_MYRI10GE=m
+CONFIG_NETXEN_NIC=m
+CONFIG_BNX2X=m
+# CONFIG_MLX4_CORE is not set
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+CONFIG_3C359=m
+# CONFIG_TMS380TR is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+CONFIG_NET_RADIO=y
+
+#
+# Obsolete Wireless cards support (pre-802.11)
+#
+# CONFIG_STRIP is not set
+CONFIG_PCMCIA_WAVELAN=m
+CONFIG_PCMCIA_NETWAVE=m
+
+#
+# Wireless 802.11 Frequency Hopping cards support
+#
+# CONFIG_PCMCIA_RAYCS is not set
+
+#
+# Wireless 802.11b ISA/PCI cards support
+#
+# CONFIG_IPW2100 is not set
+# CONFIG_IPW2200 is not set
+CONFIG_AIRO=m
+CONFIG_HERMES=m
+CONFIG_APPLE_AIRPORT=m
+CONFIG_PLX_HERMES=m
+CONFIG_TMD_HERMES=m
+CONFIG_NORTEL_HERMES=m
+CONFIG_PCI_HERMES=m
+CONFIG_ATMEL=m
+CONFIG_PCI_ATMEL=m
+
+#
+# Wireless 802.11b Pcmcia/Cardbus cards support
+#
+CONFIG_PCMCIA_HERMES=m
+CONFIG_PCMCIA_SPECTRUM=m
+CONFIG_AIRO_CS=m
+CONFIG_PCMCIA_ATMEL=m
+CONFIG_PCMCIA_WL3501=m
+
+#
+# Prism GT/Duette 802.11(a/b/g) PCI/Cardbus support
+#
+CONFIG_PRISM54=m
+CONFIG_USB_ZD1201=m
+CONFIG_HOSTAP=m
+CONFIG_HOSTAP_FIRMWARE=y
+CONFIG_HOSTAP_FIRMWARE_NVRAM=y
+CONFIG_HOSTAP_PLX=m
+CONFIG_HOSTAP_PCI=m
+CONFIG_HOSTAP_CS=m
+CONFIG_BCM43XX=m
+CONFIG_BCM43XX_DEBUG=y
+CONFIG_BCM43XX_DMA=y
+CONFIG_BCM43XX_PIO=y
+CONFIG_BCM43XX_DMA_AND_PIO_MODE=y
+# CONFIG_BCM43XX_DMA_MODE is not set
+# CONFIG_BCM43XX_PIO_MODE is not set
+CONFIG_ZD1211RW=m
+# CONFIG_ZD1211RW_DEBUG is not set
+CONFIG_NET_WIRELESS=y
+CONFIG_IWL4965=m
+CONFIG_IWL4965_QOS=y
+CONFIG_IWL4965_SPECTRUM_MEASUREMENT=y
+CONFIG_IWL4965_SENSITIVITY=y
+# CONFIG_IWL4965_DEBUG is not set
+# CONFIG_IWL3945 is not set
+
+#
+# PCMCIA network device support
+#
+CONFIG_NET_PCMCIA=y
+CONFIG_PCMCIA_3C589=m
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_FMVJ18X=m
+CONFIG_PCMCIA_PCNET=m
+CONFIG_PCMCIA_NMCLAN=m
+CONFIG_PCMCIA_SMC91C92=m
+CONFIG_PCMCIA_XIRC2PS=m
+CONFIG_PCMCIA_AXNET=m
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# ATM drivers
+#
+# CONFIG_ATM_DUMMY is not set
+CONFIG_ATM_TCP=m
+CONFIG_ATM_LANAI=m
+CONFIG_ATM_ENI=m
+# CONFIG_ATM_ENI_DEBUG is not set
+# CONFIG_ATM_ENI_TUNE_BURST is not set
+# CONFIG_ATM_FIRESTREAM is not set
+# CONFIG_ATM_ZATM is not set
+CONFIG_ATM_IDT77252=m
+# CONFIG_ATM_IDT77252_DEBUG is not set
+# CONFIG_ATM_IDT77252_RCV_ALL is not set
+CONFIG_ATM_IDT77252_USE_SUNI=y
+# CONFIG_ATM_AMBASSADOR is not set
+# CONFIG_ATM_HORIZON is not set
+CONFIG_ATM_FORE200E_MAYBE=m
+# CONFIG_ATM_FORE200E_PCA is not set
+CONFIG_ATM_HE=m
+# CONFIG_ATM_HE_USE_SUNI is not set
+CONFIG_FDDI=y
+# CONFIG_DEFXX is not set
+CONFIG_SKFP=m
+# CONFIG_HIPPI is not set
+CONFIG_PLIP=m
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_PPPOATM=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+CONFIG_NET_FC=y
+# CONFIG_SHAPER is not set
+CONFIG_NETCONSOLE=m
+CONFIG_NETPOLL=y
+# CONFIG_NETPOLL_RX is not set
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+CONFIG_ISDN=m
+
+#
+# Old ISDN4Linux
+#
+CONFIG_ISDN_I4L=m
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_IPPP_FILTER=y
+# CONFIG_ISDN_PPP_BSDCOMP is not set
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+
+#
+# ISDN feature submodules
+#
+CONFIG_ISDN_DIVERSION=m
+
+#
+# ISDN4Linux hardware drivers
+#
+
+#
+# Passive cards
+#
+CONFIG_ISDN_DRV_HISAX=m
+
+#
+# D-channel protocol features
+#
+CONFIG_HISAX_EURO=y
+CONFIG_DE_AOC=y
+CONFIG_HISAX_NO_SENDCOMPLETE=y
+CONFIG_HISAX_NO_LLC=y
+CONFIG_HISAX_NO_KEYPAD=y
+CONFIG_HISAX_1TR6=y
+CONFIG_HISAX_NI1=y
+CONFIG_HISAX_MAX_CARDS=8
+
+#
+# HiSax supported cards
+#
+CONFIG_HISAX_16_3=y
+CONFIG_HISAX_S0BOX=y
+CONFIG_HISAX_AVM_A1_PCMCIA=y
+CONFIG_HISAX_ELSA=y
+CONFIG_HISAX_DIEHLDIVA=y
+CONFIG_HISAX_SEDLBAUER=y
+CONFIG_HISAX_NICCY=y
+CONFIG_HISAX_BKM_A4T=y
+CONFIG_HISAX_SCT_QUADRO=y
+CONFIG_HISAX_GAZEL=y
+CONFIG_HISAX_W6692=y
+CONFIG_HISAX_HFC_SX=y
+# CONFIG_HISAX_DEBUG is not set
+
+#
+# HiSax PCMCIA card service modules
+#
+CONFIG_HISAX_SEDLBAUER_CS=m
+CONFIG_HISAX_ELSA_CS=m
+CONFIG_HISAX_AVM_A1_CS=m
+CONFIG_HISAX_TELES_CS=m
+
+#
+# HiSax sub driver modules
+#
+CONFIG_HISAX_ST5481=m
+# CONFIG_HISAX_HFCUSB is not set
+CONFIG_HISAX_HFC4S8S=m
+CONFIG_HISAX_FRITZ_PCIPNP=m
+CONFIG_HISAX_HDLC=y
+
+#
+# Active cards
+#
+
+#
+# Siemens Gigaset
+#
+CONFIG_ISDN_DRV_GIGASET=m
+CONFIG_GIGASET_BASE=m
+CONFIG_GIGASET_M105=m
+# CONFIG_GIGASET_DEBUG is not set
+# CONFIG_GIGASET_UNDOCREQ is not set
+
+#
+# CAPI subsystem
+#
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIFS_BOOL=y
+CONFIG_ISDN_CAPI_CAPIFS=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+
+#
+# CAPI hardware drivers
+#
+
+#
+# Active AVM cards
+#
+CONFIG_CAPI_AVM=y
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
+
+#
+# Active Eicon DIVA Server cards
+#
+CONFIG_CAPI_EICON=y
+CONFIG_ISDN_DIVAS=m
+CONFIG_ISDN_DIVAS_BRIPCI=y
+CONFIG_ISDN_DIVAS_PRIPCI=y
+CONFIG_ISDN_DIVAS_DIVACAPI=m
+CONFIG_ISDN_DIVAS_USERIDI=m
+CONFIG_ISDN_DIVAS_MAINT=m
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+# CONFIG_INPUT_TSDEV is not set
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+CONFIG_MOUSE_SERIAL=m
+CONFIG_MOUSE_VSXXXAA=m
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_ANALOG=m
+CONFIG_JOYSTICK_A3D=m
+CONFIG_JOYSTICK_ADI=m
+CONFIG_JOYSTICK_COBRA=m
+CONFIG_JOYSTICK_GF2K=m
+CONFIG_JOYSTICK_GRIP=m
+CONFIG_JOYSTICK_GRIP_MP=m
+CONFIG_JOYSTICK_GUILLEMOT=m
+CONFIG_JOYSTICK_INTERACT=m
+CONFIG_JOYSTICK_SIDEWINDER=m
+CONFIG_JOYSTICK_TMDC=m
+CONFIG_JOYSTICK_IFORCE=m
+CONFIG_JOYSTICK_IFORCE_USB=y
+CONFIG_JOYSTICK_IFORCE_232=y
+CONFIG_JOYSTICK_WARRIOR=m
+CONFIG_JOYSTICK_MAGELLAN=m
+CONFIG_JOYSTICK_SPACEORB=m
+CONFIG_JOYSTICK_SPACEBALL=m
+CONFIG_JOYSTICK_STINGER=m
+CONFIG_JOYSTICK_TWIDJOY=m
+CONFIG_JOYSTICK_DB9=m
+CONFIG_JOYSTICK_GAMECON=m
+CONFIG_JOYSTICK_TURBOGRAFX=m
+CONFIG_JOYSTICK_JOYDUMP=m
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_TOUCHSCREEN_GUNZE=m
+CONFIG_TOUCHSCREEN_ELO=m
+CONFIG_TOUCHSCREEN_MTOUCH=m
+# CONFIG_TOUCHSCREEN_MK712 is not set
+CONFIG_INPUT_MISC=y
+# CONFIG_INPUT_PCSPKR is not set
+CONFIG_INPUT_UINPUT=m
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+CONFIG_SERIO_SERPORT=y
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+CONFIG_SERIO_RAW=m
+CONFIG_GAMEPORT=m
+# CONFIG_GAMEPORT_NS558 is not set
+# CONFIG_GAMEPORT_L4 is not set
+CONFIG_GAMEPORT_EMU10K1=m
+CONFIG_GAMEPORT_FM801=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_COMPUTONE is not set
+# CONFIG_ROCKETPORT is not set
+CONFIG_CYCLADES=m
+# CONFIG_CYZ_INTR is not set
+# CONFIG_DIGIEPCA is not set
+# CONFIG_MOXA_INTELLIO is not set
+# CONFIG_MOXA_SMARTIO is not set
+# CONFIG_ISI is not set
+CONFIG_SYNCLINK=m
+CONFIG_SYNCLINKMP=m
+CONFIG_SYNCLINK_GT=m
+CONFIG_N_HDLC=m
+# CONFIG_SPECIALIX is not set
+# CONFIG_SX is not set
+# CONFIG_RIO is not set
+# CONFIG_STALDRV is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_PCI=y
+CONFIG_SERIAL_8250_CS=m
+CONFIG_SERIAL_8250_NR_UARTS=32
+CONFIG_SERIAL_8250_RUNTIME_UARTS=4
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_SERIAL_8250_DETECT_IRQ=y
+CONFIG_SERIAL_8250_RSA=y
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_SERIAL_PMACZILOG=m
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_SERIAL_OF_PLATFORM=y
+CONFIG_UNIX98_PTYS=y
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_CRASH is not set
+CONFIG_PRINTER=m
+CONFIG_LP_CONSOLE=y
+CONFIG_PPDEV=m
+CONFIG_TIPAR=m
+CONFIG_HVC_DRIVER=y
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVC_RTAS=y
+CONFIG_HVCS=m
+
+#
+# IPMI
+#
+CONFIG_IPMI_HANDLER=m
+CONFIG_IPMI_PANIC_EVENT=y
+CONFIG_IPMI_PANIC_STRING=y
+CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
+CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_WATCHDOG_RTAS=m
+
+#
+# PCI-based Watchdog Cards
+#
+CONFIG_PCIPCWATCHDOG=m
+CONFIG_WDTPCI=m
+CONFIG_WDT_501_PCI=y
+
+#
+# USB-based Watchdog Cards
+#
+CONFIG_USBPCWATCHDOG=m
+CONFIG_HW_RANDOM=y
+CONFIG_GEN_RTC=y
+# CONFIG_GEN_RTC_X is not set
+# CONFIG_DTLK is not set
+CONFIG_R3964=m
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_AGP=y
+CONFIG_AGP_SIS=y
+CONFIG_AGP_VIA=y
+CONFIG_AGP_UNINORTH=y
+CONFIG_DRM=m
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_RADEON=m
+CONFIG_DRM_MGA=m
+CONFIG_DRM_SIS=m
+CONFIG_DRM_VIA=m
+CONFIG_DRM_SAVAGE=m
+
+#
+# PCMCIA character devices
+#
+# CONFIG_SYNCLINK_CS is not set
+CONFIG_CARDMAN_4000=m
+CONFIG_CARDMAN_4040=m
+# CONFIG_RAW_DRIVER is not set
+CONFIG_HANGCHECK_TIMER=m
+CONFIG_TCG_TPM=m
+CONFIG_TCG_ATMEL=m
+CONFIG_TELCLOCK=m
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=m
+
+#
+# I2C Algorithms
+#
+CONFIG_I2C_ALGOBIT=y
+CONFIG_I2C_ALGOPCF=m
+CONFIG_I2C_ALGOPCA=m
+
+#
+# I2C Hardware Bus support
+#
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI1563 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+# CONFIG_I2C_AMD8111 is not set
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_I810 is not set
+CONFIG_I2C_PIIX4=m
+CONFIG_I2C_ISA=m
+CONFIG_I2C_POWERMAC=y
+CONFIG_I2C_NFORCE2=m
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_PARPORT=m
+CONFIG_I2C_PARPORT_LIGHT=m
+CONFIG_I2C_PROSAVAGE=m
+CONFIG_I2C_SAVAGE4=m
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+CONFIG_I2C_STUB=m
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+CONFIG_I2C_VOODOO3=m
+CONFIG_I2C_PCA_ISA=m
+
+#
+# Miscellaneous I2C Chip support
+#
+CONFIG_SENSORS_DS1337=m
+CONFIG_SENSORS_DS1374=m
+CONFIG_SENSORS_EEPROM=m
+CONFIG_SENSORS_PCF8574=m
+CONFIG_SENSORS_PCA9539=m
+CONFIG_SENSORS_PCF8591=m
+CONFIG_SENSORS_MAX6875=m
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+
+#
+# SPI support
+#
+# CONFIG_SPI is not set
+# CONFIG_SPI_MASTER is not set
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Hardware Monitoring support
+#
+CONFIG_HWMON=m
+CONFIG_HWMON_VID=m
+# CONFIG_SENSORS_ABITUGURU is not set
+CONFIG_SENSORS_ADM1021=m
+CONFIG_SENSORS_ADM1025=m
+CONFIG_SENSORS_ADM1026=m
+CONFIG_SENSORS_ADM1031=m
+CONFIG_SENSORS_ADM9240=m
+CONFIG_SENSORS_ASB100=m
+CONFIG_SENSORS_ATXP1=m
+CONFIG_SENSORS_DS1621=m
+CONFIG_SENSORS_F71805F=m
+CONFIG_SENSORS_FSCHER=m
+CONFIG_SENSORS_FSCPOS=m
+CONFIG_SENSORS_GL518SM=m
+CONFIG_SENSORS_GL520SM=m
+# CONFIG_SENSORS_IT87 is not set
+CONFIG_SENSORS_LM63=m
+CONFIG_SENSORS_LM75=m
+CONFIG_SENSORS_LM77=m
+CONFIG_SENSORS_LM78=m
+CONFIG_SENSORS_LM80=m
+CONFIG_SENSORS_LM83=m
+CONFIG_SENSORS_LM85=m
+CONFIG_SENSORS_LM87=m
+CONFIG_SENSORS_LM90=m
+CONFIG_SENSORS_LM92=m
+CONFIG_SENSORS_MAX1619=m
+# CONFIG_SENSORS_PC87360 is not set
+CONFIG_SENSORS_SIS5595=m
+# CONFIG_SENSORS_SMSC47M1 is not set
+CONFIG_SENSORS_SMSC47M192=m
+# CONFIG_SENSORS_SMSC47B397 is not set
+CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_VT8231=m
+# CONFIG_SENSORS_W83781D is not set
+CONFIG_SENSORS_W83791D=m
+CONFIG_SENSORS_W83792D=m
+CONFIG_SENSORS_W83L785TS=m
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
+# CONFIG_HWMON_DEBUG_CHIP is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+CONFIG_VIDEO_V4L1=y
+CONFIG_VIDEO_V4L1_COMPAT=y
+CONFIG_VIDEO_V4L2=y
+
+#
+# Video Capture Adapters
+#
+
+#
+# Video Capture Adapters
+#
+# CONFIG_VIDEO_ADV_DEBUG is not set
+# CONFIG_VIDEO_VIVI is not set
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_BT848_DVB=y
+CONFIG_VIDEO_SAA6588=m
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+CONFIG_VIDEO_CPIA_USB=m
+CONFIG_VIDEO_CPIA2=m
+CONFIG_VIDEO_SAA5246A=m
+CONFIG_VIDEO_SAA5249=m
+CONFIG_TUNER_3036=m
+CONFIG_VIDEO_SAA7134=m
+CONFIG_VIDEO_SAA7134_ALSA=m
+CONFIG_VIDEO_SAA7134_DVB=m
+CONFIG_VIDEO_SAA7134_DVB_ALL_FRONTENDS=y
+CONFIG_VIDEO_MXB=m
+CONFIG_VIDEO_DPC=m
+CONFIG_VIDEO_HEXIUM_ORION=m
+CONFIG_VIDEO_HEXIUM_GEMINI=m
+CONFIG_VIDEO_CX88_VP3054=m
+CONFIG_VIDEO_CX88=m
+CONFIG_VIDEO_CX88_ALSA=m
+CONFIG_VIDEO_CX88_BLACKBIRD=m
+CONFIG_VIDEO_CX88_DVB=m
+CONFIG_VIDEO_CX88_DVB_ALL_FRONTENDS=y
+
+#
+# Encoders and Decoders
+#
+CONFIG_VIDEO_MSP3400=m
+CONFIG_VIDEO_CS53L32A=m
+CONFIG_VIDEO_TLV320AIC23B=m
+CONFIG_VIDEO_WM8775=m
+CONFIG_VIDEO_WM8739=m
+CONFIG_VIDEO_CX2341X=m
+CONFIG_VIDEO_CX25840=m
+CONFIG_VIDEO_SAA711X=m
+CONFIG_VIDEO_SAA7127=m
+CONFIG_VIDEO_UPD64031A=m
+CONFIG_VIDEO_UPD64083=m
+
+#
+# V4L USB devices
+#
+CONFIG_VIDEO_PVRUSB2=m
+CONFIG_VIDEO_PVRUSB2_24XXX=y
+CONFIG_VIDEO_PVRUSB2_SYSFS=y
+# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set
+CONFIG_VIDEO_EM28XX=m
+CONFIG_VIDEO_USBVIDEO=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_KONICAWC=m
+CONFIG_USB_QUICKCAM_MESSENGER=m
+CONFIG_USB_ET61X251=m
+CONFIG_VIDEO_OVCAMCHIP=m
+CONFIG_USB_W9968CF=m
+CONFIG_USB_OV511=m
+CONFIG_USB_SE401=m
+CONFIG_USB_SN9C102=m
+CONFIG_USB_STV680=m
+CONFIG_USB_ZC0301=m
+CONFIG_USB_PWC=m
+# CONFIG_USB_PWC_DEBUG is not set
+
+#
+# Radio Adapters
+#
+CONFIG_RADIO_GEMTEK_PCI=m
+CONFIG_RADIO_MAXIRADIO=m
+CONFIG_RADIO_MAESTRO=m
+CONFIG_USB_DSBR=m
+
+#
+# Digital Video Broadcasting Devices
+#
+CONFIG_DVB=y
+CONFIG_DVB_CORE=m
+
+#
+# Supported SAA7146 based PCI Adapters
+#
+CONFIG_DVB_AV7110=m
+CONFIG_DVB_AV7110_OSD=y
+CONFIG_DVB_BUDGET=m
+CONFIG_DVB_BUDGET_CI=m
+CONFIG_DVB_BUDGET_AV=m
+CONFIG_DVB_BUDGET_PATCH=m
+
+#
+# Supported USB Adapters
+#
+CONFIG_DVB_USB=m
+# CONFIG_DVB_USB_DEBUG is not set
+CONFIG_DVB_USB_A800=m
+CONFIG_DVB_USB_DIBUSB_MB=m
+# CONFIG_DVB_USB_DIBUSB_MB_FAULTY is not set
+CONFIG_DVB_USB_DIBUSB_MC=m
+CONFIG_DVB_USB_UMT_010=m
+CONFIG_DVB_USB_CXUSB=m
+CONFIG_DVB_USB_DIGITV=m
+CONFIG_DVB_USB_VP7045=m
+CONFIG_DVB_USB_VP702X=m
+CONFIG_DVB_USB_GP8PSK=m
+CONFIG_DVB_USB_NOVA_T_USB2=m
+CONFIG_DVB_USB_DTT200U=m
+CONFIG_DVB_TTUSB_BUDGET=m
+CONFIG_DVB_TTUSB_DEC=m
+CONFIG_DVB_CINERGYT2=m
+CONFIG_DVB_CINERGYT2_TUNING=y
+CONFIG_DVB_CINERGYT2_STREAM_URB_COUNT=32
+CONFIG_DVB_CINERGYT2_STREAM_BUF_SIZE=512
+CONFIG_DVB_CINERGYT2_QUERY_INTERVAL=250
+CONFIG_DVB_CINERGYT2_ENABLE_RC_INPUT_DEVICE=y
+CONFIG_DVB_CINERGYT2_RC_QUERY_INTERVAL=100
+
+#
+# Supported FlexCopII (B2C2) Adapters
+#
+CONFIG_DVB_B2C2_FLEXCOP=m
+CONFIG_DVB_B2C2_FLEXCOP_PCI=m
+CONFIG_DVB_B2C2_FLEXCOP_USB=m
+# CONFIG_DVB_B2C2_FLEXCOP_DEBUG is not set
+
+#
+# Supported BT878 Adapters
+#
+CONFIG_DVB_BT8XX=m
+
+#
+# Supported Pluto2 Adapters
+#
+CONFIG_DVB_PLUTO2=m
+
+#
+# Supported DVB Frontends
+#
+
+#
+# Customise DVB Frontends
+#
+
+#
+# DVB-S (satellite) frontends
+#
+CONFIG_DVB_STV0299=m
+CONFIG_DVB_CX24110=m
+CONFIG_DVB_CX24123=m
+CONFIG_DVB_TDA8083=m
+CONFIG_DVB_MT312=m
+CONFIG_DVB_VES1X93=m
+CONFIG_DVB_S5H1420=m
+
+#
+# DVB-T (terrestrial) frontends
+#
+CONFIG_DVB_SP8870=m
+CONFIG_DVB_SP887X=m
+CONFIG_DVB_CX22700=m
+CONFIG_DVB_CX22702=m
+CONFIG_DVB_L64781=m
+CONFIG_DVB_TDA1004X=m
+CONFIG_DVB_NXT6000=m
+CONFIG_DVB_MT352=m
+CONFIG_DVB_ZL10353=m
+CONFIG_DVB_DIB3000MB=m
+CONFIG_DVB_DIB3000MC=m
+
+#
+# DVB-C (cable) frontends
+#
+CONFIG_DVB_VES1820=m
+CONFIG_DVB_TDA10021=m
+CONFIG_DVB_STV0297=m
+
+#
+# ATSC (North American/Korean Terrestrial/Cable DTV) frontends
+#
+CONFIG_DVB_NXT200X=m
+CONFIG_DVB_OR51211=m
+CONFIG_DVB_OR51132=m
+CONFIG_DVB_BCM3510=m
+CONFIG_DVB_LGDT330X=m
+
+#
+# Miscellaneous devices
+#
+CONFIG_DVB_PLL=m
+CONFIG_DVB_LNBP21=m
+CONFIG_DVB_ISL6421=m
+CONFIG_VIDEO_SAA7146=m
+CONFIG_VIDEO_SAA7146_VV=m
+CONFIG_VIDEO_VIDEOBUF=m
+CONFIG_VIDEO_TUNER=m
+CONFIG_VIDEO_BUF=m
+CONFIG_VIDEO_BUF_DVB=m
+CONFIG_VIDEO_BTCX=m
+CONFIG_VIDEO_IR=m
+CONFIG_VIDEO_TVEEPROM=m
+CONFIG_USB_DABUSB=m
+
+#
+# Graphics support
+#
+# CONFIG_FIRMWARE_EDID is not set
+CONFIG_FB=y
+CONFIG_FB_CFB_FILLRECT=y
+CONFIG_FB_CFB_COPYAREA=y
+CONFIG_FB_CFB_IMAGEBLIT=y
+CONFIG_FB_MACMODES=y
+# CONFIG_FB_BACKLIGHT is not set
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_CIRRUS=m
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_ASILIANT is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_S1D13XXX is not set
+CONFIG_FB_NVIDIA=m
+CONFIG_FB_NVIDIA_I2C=y
+CONFIG_FB_RIVA=m
+# CONFIG_FB_RIVA_I2C is not set
+# CONFIG_FB_RIVA_DEBUG is not set
+CONFIG_FB_MATROX=m
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+CONFIG_FB_MATROX_MULTIHEAD=y
+CONFIG_FB_RADEON=y
+CONFIG_FB_RADEON_I2C=y
+# CONFIG_FB_RADEON_DEBUG is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+CONFIG_FB_SAVAGE=m
+CONFIG_FB_SAVAGE_I2C=y
+CONFIG_FB_SAVAGE_ACCEL=y
+# CONFIG_FB_SIS is not set
+CONFIG_FB_NEOMAGIC=m
+CONFIG_FB_KYRO=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_3DFX_ACCEL=y
+CONFIG_FB_VOODOO1=m
+CONFIG_FB_TRIDENT=m
+CONFIG_FB_TRIDENT_ACCEL=y
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_VGACON_SOFT_SCROLLBACK=y
+CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_LOGO_LINUX_CLUT224=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BACKLIGHT_CLASS_DEVICE=m
+CONFIG_BACKLIGHT_DEVICE=y
+CONFIG_LCD_CLASS_DEVICE=m
+CONFIG_LCD_DEVICE=y
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+
+#
+# Advanced Linux Sound Architecture
+#
+CONFIG_SND=m
+CONFIG_SND_TIMER=m
+CONFIG_SND_PCM=m
+CONFIG_SND_HWDEP=m
+CONFIG_SND_RAWMIDI=m
+CONFIG_SND_SEQUENCER=m
+CONFIG_SND_SEQ_DUMMY=m
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_MIXER_OSS=m
+CONFIG_SND_PCM_OSS=m
+CONFIG_SND_PCM_OSS_PLUGINS=y
+CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_DYNAMIC_MINORS=y
+# CONFIG_SND_SUPPORT_OLD_API is not set
+CONFIG_SND_VERBOSE_PROCFS=y
+# CONFIG_SND_VERBOSE_PRINTK is not set
+# CONFIG_SND_DEBUG is not set
+
+#
+# Generic devices
+#
+CONFIG_SND_MPU401_UART=m
+CONFIG_SND_OPL3_LIB=m
+CONFIG_SND_VX_LIB=m
+CONFIG_SND_AC97_CODEC=m
+CONFIG_SND_DUMMY=m
+CONFIG_SND_VIRMIDI=m
+CONFIG_SND_MTPAV=m
+# CONFIG_SND_MTS64 is not set
+# CONFIG_SND_SERIAL_U16550 is not set
+CONFIG_SND_MPU401=m
+# CONFIG_SND_PORTMAN2X4 is not set
+
+#
+# PCI devices
+#
+CONFIG_SND_AD1889=m
+CONFIG_SND_ALS300=m
+CONFIG_SND_ALS4000=m
+CONFIG_SND_ALI5451=m
+CONFIG_SND_ATIIXP=m
+CONFIG_SND_ATIIXP_MODEM=m
+CONFIG_SND_AU8810=m
+CONFIG_SND_AU8820=m
+CONFIG_SND_AU8830=m
+CONFIG_SND_AZT3328=m
+CONFIG_SND_BT87X=m
+# CONFIG_SND_BT87X_OVERCLOCK is not set
+CONFIG_SND_CA0106=m
+CONFIG_SND_CMIPCI=m
+CONFIG_SND_CS4281=m
+CONFIG_SND_CS46XX=m
+CONFIG_SND_CS46XX_NEW_DSP=y
+CONFIG_SND_DARLA20=m
+CONFIG_SND_GINA20=m
+CONFIG_SND_LAYLA20=m
+CONFIG_SND_DARLA24=m
+CONFIG_SND_GINA24=m
+CONFIG_SND_LAYLA24=m
+CONFIG_SND_MONA=m
+CONFIG_SND_MIA=m
+CONFIG_SND_ECHO3G=m
+CONFIG_SND_INDIGO=m
+CONFIG_SND_INDIGOIO=m
+CONFIG_SND_INDIGODJ=m
+CONFIG_SND_EMU10K1=m
+CONFIG_SND_EMU10K1X=m
+CONFIG_SND_ENS1370=m
+CONFIG_SND_ENS1371=m
+CONFIG_SND_ES1938=m
+CONFIG_SND_ES1968=m
+CONFIG_SND_FM801=m
+CONFIG_SND_FM801_TEA575X_BOOL=y
+CONFIG_SND_FM801_TEA575X=m
+CONFIG_SND_HDA_INTEL=m
+CONFIG_SND_HDSP=m
+CONFIG_SND_HDSPM=m
+CONFIG_SND_ICE1712=m
+CONFIG_SND_ICE1724=m
+CONFIG_SND_INTEL8X0=m
+CONFIG_SND_INTEL8X0M=m
+CONFIG_SND_KORG1212=m
+CONFIG_SND_MAESTRO3=m
+CONFIG_SND_MIXART=m
+CONFIG_SND_NM256=m
+CONFIG_SND_PCXHR=m
+CONFIG_SND_RIPTIDE=m
+CONFIG_SND_RME32=m
+CONFIG_SND_RME96=m
+CONFIG_SND_RME9652=m
+CONFIG_SND_SONICVIBES=m
+CONFIG_SND_TRIDENT=m
+CONFIG_SND_VIA82XX=m
+CONFIG_SND_VIA82XX_MODEM=m
+CONFIG_SND_VX222=m
+CONFIG_SND_YMFPCI=m
+# CONFIG_SND_AC97_POWER_SAVE is not set
+
+#
+# ALSA PowerMac devices
+#
+CONFIG_SND_POWERMAC=m
+CONFIG_SND_POWERMAC_AUTO_DRC=y
+
+#
+# Apple Onboard Audio driver
+#
+CONFIG_SND_AOA=m
+CONFIG_SND_AOA_FABRIC_LAYOUT=m
+CONFIG_SND_AOA_ONYX=m
+CONFIG_SND_AOA_TAS=m
+CONFIG_SND_AOA_TOONIE=m
+CONFIG_SND_AOA_SOUNDBUS=m
+CONFIG_SND_AOA_SOUNDBUS_I2S=m
+
+#
+# USB devices
+#
+CONFIG_SND_USB_AUDIO=m
+CONFIG_SND_USB_USX2Y=m
+
+#
+# PCMCIA devices
+#
+# CONFIG_SND_VXPOCKET is not set
+# CONFIG_SND_PDAUDIOCF is not set
+
+#
+# SoC audio support
+#
+# CONFIG_SND_SOC is not set
+
+#
+# Open Sound System
+#
+# CONFIG_SOUND_PRIME is not set
+CONFIG_AC97_BUS=m
+
+#
+# USB support
+#
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
+CONFIG_USB_ARCH_HAS_EHCI=y
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_OTG is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+CONFIG_USB_EHCI_TT_NEWSCHED=y
+CONFIG_USB_ISP116X_HCD=m
+CONFIG_USB_OHCI_HCD=m
+# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_UHCI_HCD=m
+CONFIG_USB_SL811_HCD=m
+CONFIG_USB_SL811_CS=m
+
+#
+# USB Device Class drivers
+#
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# may also be needed; see USB_STORAGE Help for more information
+#
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_USBAT=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_STORAGE_ALAUDA=y
+# CONFIG_USB_LIBUSUAL is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+CONFIG_USB_HIDINPUT=y
+CONFIG_USB_HIDINPUT_POWERBOOK=y
+CONFIG_HID_FF=y
+CONFIG_HID_PID=y
+CONFIG_LOGITECH_FF=y
+CONFIG_THRUSTMASTER_FF=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_ACECAD=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_TOUCHSCREEN=m
+CONFIG_USB_TOUCHSCREEN_EGALAX=y
+CONFIG_USB_TOUCHSCREEN_PANJIT=y
+CONFIG_USB_TOUCHSCREEN_3M=y
+CONFIG_USB_TOUCHSCREEN_ITM=y
+# CONFIG_USB_YEALINK is not set
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+CONFIG_USB_ATI_REMOTE2=m
+CONFIG_USB_KEYSPAN_REMOTE=m
+CONFIG_USB_APPLETOUCH=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_NET_AX8817X=m
+CONFIG_USB_NET_CDCETHER=m
+CONFIG_USB_NET_DM9601=m
+CONFIG_USB_NET_GL620A=m
+CONFIG_USB_NET_NET1080=m
+CONFIG_USB_NET_PLUSB=m
+CONFIG_USB_NET_RNDIS_HOST=m
+CONFIG_USB_NET_CDC_SUBSET=m
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_NET_ZAURUS=m
+CONFIG_USB_MON=y
+
+#
+# USB port drivers
+#
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_AIRPRIME=m
+CONFIG_USB_SERIAL_ARK3116=m
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_CP2101=m
+CONFIG_USB_SERIAL_CYPRESS_M8=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_FUNSOFT=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_GARMIN=m
+CONFIG_USB_SERIAL_IPW=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_NAVMAN=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_HP4X=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_SIERRAWIRELESS=m
+CONFIG_USB_SERIAL_TI=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OPTION=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_LEGOTOWER=m
+CONFIG_USB_LCD=m
+CONFIG_USB_LED=m
+# CONFIG_USB_CYPRESS_CY7C63 is not set
+# CONFIG_USB_CYTHERM is not set
+CONFIG_USB_PHIDGETKIT=m
+CONFIG_USB_PHIDGETSERVO=m
+CONFIG_USB_IDMOUSE=m
+CONFIG_USB_APPLEDISPLAY=m
+CONFIG_USB_SISUSBVGA=m
+CONFIG_USB_SISUSBVGA_CON=y
+CONFIG_USB_LD=m
+CONFIG_USB_TEST=m
+
+#
+# USB DSL modem support
+#
+CONFIG_USB_ATM=m
+CONFIG_USB_SPEEDTOUCH=m
+CONFIG_USB_CXACRU=m
+CONFIG_USB_UEAGLEATM=m
+CONFIG_USB_XUSBATM=m
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+CONFIG_MMC=m
+# CONFIG_MMC_DEBUG is not set
+CONFIG_MMC_BLOCK=m
+CONFIG_MMC_SDHCI=m
+# CONFIG_MMC_WBSD is not set
+
+#
+# LED devices
+#
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+
+#
+# LED drivers
+#
+
+#
+# LED Triggers
+#
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_TIMER=m
+CONFIG_LEDS_TRIGGER_IDE_DISK=y
+CONFIG_LEDS_TRIGGER_HEARTBEAT=m
+# CONFIG_INFINIBAND is not set
+
+#
+# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
+#
+
+#
+# Real Time Clock
+#
+CONFIG_RTC_LIB=m
+CONFIG_RTC_CLASS=m
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_SYSFS=m
+CONFIG_RTC_INTF_PROC=m
+CONFIG_RTC_INTF_DEV=m
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+
+#
+# RTC drivers
+#
+CONFIG_RTC_DRV_X1205=m
+CONFIG_RTC_DRV_DS1307=m
+CONFIG_RTC_DRV_DS1553=m
+CONFIG_RTC_DRV_ISL1208=m
+CONFIG_RTC_DRV_DS1672=m
+CONFIG_RTC_DRV_DS1742=m
+CONFIG_RTC_DRV_PCF8563=m
+CONFIG_RTC_DRV_PCF8583=m
+CONFIG_RTC_DRV_RS5C372=m
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_TEST is not set
+CONFIG_RTC_DRV_V3020=m
+
+#
+# DMA Engine support
+#
+CONFIG_DMA_ENGINE=y
+
+#
+# DMA Clients
+#
+CONFIG_NET_DMA=y
+
+#
+# DMA Devices
+#
+CONFIG_INTEL_IOATDMA=m
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT2_FS_XIP=y
+CONFIG_FS_XIP=y
+CONFIG_EXT3_FS=m
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=m
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
+# CONFIG_JFS_DEBUG is not set
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_QUOTA=y
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+# CONFIG_XFS_RT is not set
+CONFIG_GFS2_FS=m
+CONFIG_GFS2_FS_LOCKING_NOLOCK=m
+CONFIG_GFS2_FS_LOCKING_DLM=m
+CONFIG_OCFS2_FS=m
+# CONFIG_OCFS2_DEBUG_MASKLOG is not set
+CONFIG_MINIX_FS=m
+CONFIG_ROMFS_FS=m
+CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
+CONFIG_QUOTA=y
+# CONFIG_QFMT_V1 is not set
+CONFIG_QFMT_V2=y
+CONFIG_QUOTACTL=y
+CONFIG_DNOTIFY=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_FUSE_FS=m
+
+#
+# Caches
+#
+CONFIG_FSCACHE=m
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_UDF_FS=m
+CONFIG_UDF_NLS=y
+CONFIG_CACHEFILES=m
+CONFIG_CACHEFILES_DEBUG=y
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="ascii"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+CONFIG_AFFS_FS=m
+CONFIG_ECRYPT_FS=m
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EFS_FS=m
+# CONFIG_JFFS_FS is not set
+CONFIG_JFFS2_FS=m
+CONFIG_JFFS2_FS_DEBUG=0
+CONFIG_JFFS2_FS_WRITEBUFFER=y
+CONFIG_JFFS2_SUMMARY=y
+# CONFIG_JFFS2_FS_XATTR is not set
+# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set
+CONFIG_JFFS2_ZLIB=y
+CONFIG_JFFS2_RTIME=y
+# CONFIG_JFFS2_RUBIN is not set
+CONFIG_CRAMFS=m
+CONFIG_SQUASHFS=m
+# CONFIG_SQUASHFS_EMBEDDED is not set
+CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3
+# CONFIG_SQUASHFS_VMALLOC is not set
+CONFIG_VXFS_FS=m
+# CONFIG_HPFS_FS is not set
+CONFIG_QNX4FS_FS=m
+CONFIG_SYSV_FS=m
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+# CONFIG_UFS_DEBUG is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_FSCACHE=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_V3_ACL=y
+CONFIG_NFSD_V4=y
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_NFS_ACL_SUPPORT=m
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=m
+CONFIG_SUNRPC_GSS=m
+CONFIG_RPCSEC_GSS_KRB5=m
+CONFIG_RPCSEC_GSS_SPKM3=m
+# CONFIG_SMB_FS is not set
+CONFIG_CIFS=m
+# CONFIG_CIFS_STATS is not set
+CONFIG_CIFS_WEAK_PW_HASH=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+# CONFIG_CIFS_DEBUG2 is not set
+# CONFIG_CIFS_EXPERIMENTAL is not set
+# CONFIG_NCP_FS is not set
+CONFIG_CODA_FS=m
+# CONFIG_CODA_FS_OLD_API is not set
+# CONFIG_AFS_FS is not set
+CONFIG_9P_FS=m
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Distributed Lock Manager
+#
+CONFIG_DLM=m
+CONFIG_DLM_DEBUG=y
+
+#
+# Library routines
+#
+CONFIG_CRC_CCITT=m
+CONFIG_CRC16=m
+CONFIG_CRC32=y
+CONFIG_LIBCRC32C=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+CONFIG_TEXTSEARCH=y
+CONFIG_TEXTSEARCH_KMP=m
+CONFIG_TEXTSEARCH_BM=m
+CONFIG_TEXTSEARCH_FSM=m
+CONFIG_PLIST=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+
+#
+# Instrumentation Support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=m
+CONFIG_OPROFILE_CELL=y
+CONFIG_KPROBES=y
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_KERNEL=y
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_DETECT_SOFTLOCKUP=y
+CONFIG_SCHEDSTATS=y
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_RT_MUTEXES is not set
+# CONFIG_RT_MUTEX_TESTER is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_RWSEMS is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+# CONFIG_DEBUG_KOBJECT is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
+CONFIG_DEBUG_LIST=y
+# CONFIG_FORCED_INLINING is not set
+CONFIG_BOOT_DELAY=y
+# CONFIG_RCU_TORTURE_TEST is not set
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+CONFIG_XMON_DEFAULT=y
+CONFIG_IRQSTACKS=y
+CONFIG_BOOTX_TEXT=y
+# CONFIG_PPC_EARLY_DEBUG is not set
+
+#
+# Security options
+#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_NETWORK_XFRM=y
+CONFIG_SECURITY_CAPABILITIES=y
+# CONFIG_SECURITY_ROOTPLUG is not set
+# CONFIG_SECURITY_SECLVL is not set
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
+CONFIG_SECURITY_SELINUX_DISABLE=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+CONFIG_SECURITY_SELINUX_AVC_STATS=y
+CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
+CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y
+# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
+CONFIG_KEYS_COMPAT=y
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_API=m
+CONFIG_CRYPTO_ALGAPI=m
+CONFIG_CRYPTO_AEAD=m
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_SEQIV=m
+CONFIG_CRYPTO_HASH=m
+CONFIG_CRYPTO_MANAGER=m
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_NHMAC=m
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=m
+CONFIG_CRYPTO_SHA1=y
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_CTR=m
+CONFIG_CRYPTO_CCM=m
+CONFIG_CRYPTO_DES=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_KHAZAD=m
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_CRC32C=y
+# CONFIG_CRYPTO_TEST is not set
+CONFIG_CRYPTO_AUTHENC=m
+CONFIG_CRYPTO_SIGNATURE=y
+CONFIG_CRYPTO_SIGNATURE_DSA=y
+CONFIG_CRYPTO_MPILIB=y
+
+#
+# Hardware crypto devices
+#
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686-bigsmp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-i686-bigsmp.config

similarity index 100%

rename from lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686-bigsmp.config

rename to lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-i686-bigsmp.config
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-i686.config

similarity index 100%

rename from lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686.config

rename to lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-i686.config
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-x86_64-smp.config

similarity index 100%

rename from lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64-smp.config

rename to lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-x86_64-smp.config
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-x86_64.config

similarity index 100%

rename from lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64.config

rename to lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-vanilla-x86_64.config
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config

index 9971cfa..1cd6e57 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config
@@ -2817,7 +2817,7 @@ CONFIG_MAGIC_SYSRQ=y
  # CONFIG_DEBUG_SPINLOCK is not set
  # CONFIG_DEBUG_PAGEALLOC is not set
  # CONFIG_DEBUG_HIGHMEM is not set
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
  # CONFIG_FRAME_POINTER is not set
  # CONFIG_KDB is not set
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config

index c205dc4..6132cf5 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config
@@ -2364,7 +2364,7 @@ CONFIG_MAGIC_SYSRQ=y
  # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
  # CONFIG_IA64_DEBUG_CMPXCHG is not set
  # CONFIG_IA64_DEBUG_IRQ is not set
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_SYSVIPC_COMPAT=y
  
  #
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config

index c205dc4..6132cf5 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config
@@ -2364,7 +2364,7 @@ CONFIG_MAGIC_SYSRQ=y
  # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
  # CONFIG_IA64_DEBUG_CMPXCHG is not set
  # CONFIG_IA64_DEBUG_IRQ is not set
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_SYSVIPC_COMPAT=y
  
  #
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config

index 76b4290..149908b 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config
@@ -1392,7 +1392,7 @@ CONFIG_KDB=y
  CONFIG_KDB_MODULES=y
  CONFIG_KDB_OFF=y
  # CONFIG_PPCDBG is not set
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_IRQSTACKS=y
  
  #
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config

index b5e692b..c36face 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config
@@ -1391,7 +1391,7 @@ CONFIG_KDB=y
  CONFIG_KDB_MODULES=y
  CONFIG_KDB_OFF=y
  # CONFIG_PPCDBG is not set
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_IRQSTACKS=y
  
  #
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc64-smp.config

new file mode 100644 (file)

index 0000000..d033dbc
--- /dev/null
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc64-smp.config
@@ -0,0 +1,1624 @@
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_64BIT=y
+CONFIG_MMU=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_HAVE_DEC_LOCK=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_FRAME_POINTER=y
+CONFIG_FORCE_MAX_ZONEORDER=13
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_STANDALONE=y
+
+#
+# General setup
+#
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_LOG_BUF_SHIFT=19
+CONFIG_HOTPLUG=y
+CONFIG_EVLOG=y
+# CONFIG_EVLOG_FWPRINTK is not set
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_EMBEDDED is not set
+
+#
+# Class Based Kernel Resource Management
+#
+CONFIG_CKRM=y
+CONFIG_RCFS_FS=m
+CONFIG_CKRM_TYPE_TASKCLASS=y
+CONFIG_CKRM_RES_NUMTASKS=m
+CONFIG_CKRM_CPU_SCHEDULE=y
+# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
+CONFIG_CKRM_RES_BLKIO=y
+CONFIG_CKRM_TYPE_SOCKETCLASS=y
+CONFIG_CKRM_RBCE=m
+CONFIG_CKRM_CRBCE=m
+CONFIG_DELAY_ACCT=y
+CONFIG_KALLSYMS=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_IOSCHED_PS=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_OBSOLETE_MODPARM=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+
+#
+# Platform support
+#
+# CONFIG_PPC_ISERIES is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC=y
+CONFIG_PPC64=y
+CONFIG_PPC_OF=y
+CONFIG_ALTIVEC=y
+# CONFIG_PPC_PMAC is not set
+CONFIG_PPC_SPLPAR=y
+# CONFIG_BOOTX_TEXT is not set
+# CONFIG_POWER4_ONLY is not set
+# CONFIG_IOMMU_VMERGE is not set
+CONFIG_SMP=y
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_NR_CPUS=128
+# CONFIG_HMT is not set
+CONFIG_DISCONTIGMEM=y
+CONFIG_NUMA=y
+CONFIG_SCHED_SMT=y
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_FLASH=m
+CONFIG_SCANLOG=m
+CONFIG_LPARCFG=y
+
+#
+# General setup
+#
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_NAMES is not set
+CONFIG_HOTPLUG_CPU=y
+
+#
+# PCMCIA/CardBus support
+#
+# CONFIG_PCMCIA is not set
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+# CONFIG_HOTPLUG_PCI_SHPC is not set
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_FW_LOADER=m
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+CONFIG_PARPORT_PC_FIFO=y
+CONFIG_PARPORT_PC_SUPERIO=y
+CONFIG_PARPORT_OTHER=y
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_CARMEL is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=123456
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CIPHER_TWOFISH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+CONFIG_BLK_DEV_IDEDISK=y
+# CONFIG_IDEDISK_MULTI_MODE is not set
+# CONFIG_IDEDISK_STROKE is not set
+CONFIG_BLK_DEV_IDECD=y
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+# CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_IDE_TASK_IOCTL=y
+# CONFIG_IDE_TASKFILE_IO is not set
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+CONFIG_BLK_DEV_GENERIC=y
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_SL82C105=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_BLK_DEV_IDEDMA_FORCED=y
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_ADMA=y
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+# CONFIG_BLK_DEV_PIIX is not set
+# CONFIG_BLK_DEV_NS87415 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+CONFIG_PDC202XX_BURST=y
+# CONFIG_BLK_DEV_PDC202XX_NEW is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_SCSI=m
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_SCSI_DUMP=m
+CONFIG_SD_IOSTATS=y
+CONFIG_CHR_DEV_ST=m
+# CONFIG_CHR_DEV_OSST is not set
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport Attributes
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+CONFIG_SCSI_3W_9XXX=m
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+CONFIG_SCSI_ADP94XX=m
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+# CONFIG_SCSI_AIC79XX is not set
+# CONFIG_SCSI_ADVANSYS is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+# CONFIG_MEGARAID_SAS is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_CPQFCTS is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_EATA_PIO is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_IBMVSCSIS=m
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_PPA is not set
+# CONFIG_SCSI_IMM is not set
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY=y
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+CONFIG_SCSI_IPR_EEH_RECOVERY=y
+# CONFIG_SCSI_QLOGIC_ISP is not set
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA2XXX=m
+# CONFIG_SCSI_QLA21XX is not set
+# CONFIG_SCSI_QLA22XX is not set
+CONFIG_SCSI_QLA2300=m
+# CONFIG_SCSI_QLA2322 is not set
+CONFIG_SCSI_QLA24XX=m
+CONFIG_SCSI_QLA2XXX_FAILOVER=y
+CONFIG_SCSI_QLA4XXX=m
+CONFIG_SCSI_QLA4XXX_FAILOVER=y
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+CONFIG_SCSI_DEBUG=m
+CONFIG_ATA=m
+# CONFIG_ATA_NONSTANDARD is not set
+# CONFIG_SATA_AHCI is not set
+# CONFIG_SATA_SVW is not set
+# CONFIG_ATA_PIIX is not set
+# CONFIG_SATA_MV is not set
+# CONFIG_SATA_NV is not set
+# CONFIG_PDC_ADMA is not set
+# CONFIG_SATA_QSTOR is not set
+# CONFIG_SATA_PROMISE is not set
+# CONFIG_SATA_SX4 is not set
+# CONFIG_SATA_SIL is not set
+# CONFIG_SATA_SIL24 is not set
+# CONFIG_SATA_SIS is not set
+# CONFIG_SATA_ULI is not set
+# CONFIG_SATA_VIA is not set
+# CONFIG_SATA_VITESSE is not set
+# CONFIG_SATA_INIC162X is not set
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD640_PCI is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MARVELL is not set
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+CONFIG_PATA_PDC2027X=m
+# CONFIG_PATA_SIL680 is not set
+# CONFIG_PATA_SIS is not set
+# CONFIG_PATA_VIA is not set
+# CONFIG_PATA_WINBOND is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_RAID6=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
+CONFIG_DM_FLAKEY=m
+CONFIG_BLK_DEV_DM_BBR=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=y
+# CONFIG_FUSION_SPI is not set
+# CONFIG_FUSION_FC is not set
+CONFIG_FUSION_SAS=m
+CONFIG_FUSION_MAX_SGE=128
+
+#
+# IEEE 1394 (FireWire) support
+#
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+
+#
+# Networking support
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_UNIX=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+# CONFIG_IP_ROUTE_FWMARK is not set
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+# CONFIG_IP_ROUTE_VERBOSE is not set
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+# CONFIG_ACCEPT_QUEUES is not set
+
+#
+# IP: Virtual Server Configuration
+#
+CONFIG_IP_VS=m
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=12
+
+#
+# IPVS transport protocol load balancing support
+#
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_PROTO_ESP=y
+CONFIG_IP_VS_PROTO_AH=y
+
+#
+# IPVS scheduler
+#
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+CONFIG_IP_VS_SED=m
+CONFIG_IP_VS_NQ=m
+
+#
+# IPVS application helper
+#
+CONFIG_IP_VS_FTP=m
+CONFIG_IPV6=m
+CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_NDISC_NEW=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_IPV6_TUNNEL=m
+
+#
+# MOBILE IPv6 (EXPERIMENTAL)
+#
+CONFIG_IPV6_MOBILITY=m
+CONFIG_IPV6_MOBILITY_MN=m
+CONFIG_IPV6_MOBILITY_HA=m
+# CONFIG_IPV6_MOBILITY_DEBUG is not set
+# CONFIG_DECNET is not set
+CONFIG_BRIDGE=m
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_CT_PROTO_SCTP=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_POLICY=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_PHYSDEV=m
+CONFIG_IP_NF_MATCH_SCTP=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_TARGET_CLASSIFY=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+# CONFIG_IP_NF_COMPAT_IPCHAINS is not set
+# CONFIG_IP_NF_COMPAT_IPFWADM is not set
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_TARGET_CONNMARK=m
+CONFIG_IP_NF_MATCH_CONNMARK=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_MATCH_ADDRTYPE=m
+CONFIG_IP_NF_MATCH_HASHLIMIT=m
+# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set
+
+#
+# IPv6: Netfilter Configuration
+#
+CONFIG_IP6_NF_FTP=m
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_CONNTRACK=m
+CONFIG_IP6_NF_MATCH_STATE=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+# CONFIG_BRIDGE_EBT_SNAT is not set
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+CONFIG_SCTP_HMAC_NONE=y
+# CONFIG_SCTP_HMAC_SHA1 is not set
+# CONFIG_SCTP_HMAC_MD5 is not set
+# CONFIG_ATM is not set
+CONFIG_VLAN_8021Q=m
+CONFIG_LLC=y
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_IPX_INTERN=y
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_CSZ=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_DELAY=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+# CONFIG_ETHERTAP is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_OAKNET is not set
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+# CONFIG_NET_TULIP is not set
+# CONFIG_HP100 is not set
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+# CONFIG_AMD8111_ETH is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+# CONFIG_FORCEDETH is not set
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+CONFIG_E100_NAPI=y
+CONFIG_E100_EEH_RECOVERY=y
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+# CONFIG_DL2K is not set
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+CONFIG_E1000_EEH_RECOVERY=y
+CONFIG_IGB=m
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_R8169 is not set
+# CONFIG_SIS190 is not set
+# CONFIG_SK98LIN is not set
+CONFIG_TIGON3=m
+CONFIG_NET_BROADCOM=m
+# CONFIG_NET_BCM44 is not set
+CONFIG_BNX2=m
+# CONFIG_2BUFF_MODE is not set
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_QLA3XXX=m
+CONFIG_IXGB_EEH_RECOVERY=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_MYRI10GE=m
+CONFIG_NETXEN_NIC=m
+CONFIG_IBMVETH=m
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PLIP is not set
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+# CONFIG_IBMLS is not set
+# CONFIG_3C359 is not set
+# CONFIG_TMS380TR is not set
+CONFIG_NET_FC=y
+CONFIG_SHAPER=m
+CONFIG_NETCONSOLE=m
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# Amateur Radio support
+#
+# CONFIG_HAMRADIO is not set
+
+#
+# IrDA (infrared) support
+#
+# CONFIG_IRDA is not set
+
+#
+# Bluetooth support
+#
+# CONFIG_BT is not set
+CONFIG_NETPOLL=y
+CONFIG_NETPOLL_RX=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_TSDEV=m
+CONFIG_INPUT_TSDEV_SCREEN_X=240
+CONFIG_INPUT_TSDEV_SCREEN_Y=320
+CONFIG_INPUT_EVDEV=m
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input I/O drivers
+#
+# CONFIG_GAMEPORT is not set
+CONFIG_SOUND_GAMEPORT=y
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_POSFILTER is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+# CONFIG_MOUSE_SERIAL is not set
+# CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=m
+CONFIG_INPUT_UINPUT=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_ECC=m
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_PMACZILOG is not set
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+CONFIG_PRINTER=m
+# CONFIG_LP_CONSOLE is not set
+# CONFIG_PPDEV is not set
+# CONFIG_TIPAR is not set
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVCS=m
+# CONFIG_QIC02_TAPE is not set
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
+# CONFIG_RTC is not set
+# CONFIG_GEN_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+
+#
+# TPM devices
+#
+CONFIG_TCG_TPM=m
+CONFIG_TCG_NSC=m
+CONFIG_TCG_ATMEL=m
+# CONFIG_AGP is not set
+# CONFIG_DRM is not set
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=4096
+CONFIG_HANGCHECK_TIMER=m
+
+#
+# Linux InfraRed Controller
+#
+# CONFIG_LIRC_SUPPORT is not set
+# CONFIG_LIRC_HOMEBREW is not set
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=y
+
+#
+# I2C Algorithms
+#
+CONFIG_I2C_ALGOBIT=y
+# CONFIG_I2C_ALGOPCF is not set
+
+#
+# I2C Hardware Bus support
+#
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+# CONFIG_I2C_AMD8111 is not set
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_ISA is not set
+# CONFIG_I2C_NFORCE2 is not set
+# CONFIG_I2C_PARPORT is not set
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_PROSAVAGE is not set
+# CONFIG_I2C_SAVAGE4 is not set
+# CONFIG_SCx200_ACB is not set
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+# CONFIG_I2C_VOODOO3 is not set
+
+#
+# Hardware Sensors Chip support
+#
+# CONFIG_I2C_SENSOR is not set
+# CONFIG_SENSORS_ADM1021 is not set
+# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_DS1621 is not set
+# CONFIG_SENSORS_FSCHER is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+# CONFIG_SENSORS_LM83 is not set
+# CONFIG_SENSORS_LM85 is not set
+# CONFIG_SENSORS_LM90 is not set
+# CONFIG_SENSORS_VIA686A is not set
+# CONFIG_SENSORS_VT1211 is not set
+# CONFIG_SENSORS_W83781D is not set
+# CONFIG_SENSORS_W83L785TS is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_PCF8574 is not set
+
+#
+# Other I2C Chip support
+#
+# CONFIG_SENSORS_EEPROM is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+CONFIG_FB=y
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_CT65550 is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_S3TRIO is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_RIVA is not set
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G450=y
+CONFIG_FB_MATROX_G100=y
+# CONFIG_FB_MATROX_I2C is not set
+CONFIG_FB_MATROX_MULTIHEAD=y
+# CONFIG_FB_RADEON_OLD is not set
+CONFIG_FB_RADEON=y
+CONFIG_FB_RADEON_I2C=y
+# CONFIG_FB_RADEON_DEBUG is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+# CONFIG_FB_SIS is not set
+# CONFIG_FB_NEOMAGIC is not set
+# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set
+# CONFIG_FB_VOODOO1 is not set
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+# CONFIG_VGA_CONSOLE is not set
+# CONFIG_MDA_CONSOLE is not set
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_PCI_CONSOLE=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+# CONFIG_LOGO is not set
+
+#
+# Bootsplash configuration
+#
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+CONFIG_USB_OHCI_HCD=m
+# CONFIG_USB_UHCI_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_BLUETOOTH_TTY is not set
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+
+#
+# USB Human Interface Devices (HID)
+#
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+# CONFIG_HID_FF is not set
+CONFIG_USB_HIDDEV=y
+
+#
+# USB HID Boot Protocol drivers
+#
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_MTOUCH=m
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+
+#
+# USB Multimedia devices
+#
+# CONFIG_USB_DABUSB is not set
+
+#
+# Video4Linux support is needed for USB Multimedia device support
+#
+
+#
+# USB Network adaptors
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+
+#
+# USB Host-to-Host Cables
+#
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_GENESYS=y
+CONFIG_USB_NET1080=y
+CONFIG_USB_PL2301=y
+
+#
+# Intelligent USB Devices/Gadgets
+#
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_ZAURUS=y
+CONFIG_USB_CDCETHER=y
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_AX8817X=y
+
+#
+# USB port drivers
+#
+# CONFIG_USB_USS720 is not set
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+# CONFIG_USB_TIGL is not set
+# CONFIG_USB_AUERSWALD is not set
+# CONFIG_USB_RIO500 is not set
+CONFIG_USB_LEGOTOWER=m
+# CONFIG_USB_LCD is not set
+CONFIG_USB_LED=m
+CONFIG_USB_CYTHERM=m
+# CONFIG_USB_TEST is not set
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+CONFIG_AUDIT=m
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_DMAPI=y
+# CONFIG_JFS_DEBUG is not set
+CONFIG_JFS_STATISTICS=y
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_EXPORT=y
+CONFIG_XFS_RT=y
+CONFIG_XFS_QUOTA=m
+CONFIG_XFS_DMAPI=y
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_OCFS2_FS=m
+CONFIG_MINIX_FS=m
+# CONFIG_ROMFS_FS is not set
+CONFIG_DMAPI=m
+# CONFIG_DMAPI_DEBUG is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
+CONFIG_QFMT_V2=m
+CONFIG_QUOTACTL=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_UDF_FS=m
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+# CONFIG_DEVFS_FS is not set
+CONFIG_DEVPTS_FS_XATTR=y
+CONFIG_DEVPTS_FS_SECURITY=y
+CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+CONFIG_RELAYFS_FS=m
+# CONFIG_KLOG_CHANNEL is not set
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=y
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_NFS_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_ACL=y
+CONFIG_NFS_ACL_SUPPORT=y
+# CONFIG_NFSD_V4 is not set
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=y
+CONFIG_STATD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_SUNRPC=y
+CONFIG_SUNRPC_GSS=y
+CONFIG_RPCSEC_GSS_KRB5=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_ATARI_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+CONFIG_NEC98_PARTITION=y
+CONFIG_SGI_PARTITION=y
+CONFIG_ULTRIX_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+CONFIG_FSHOOKS=y
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+
+#
+# Kernel hacking
+#
+CONFIG_KERNTYPES=y
+CONFIG_CRASH_DUMP=m
+CONFIG_CRASH_DUMP_BLOCKDEV=m
+CONFIG_CRASH_DUMP_NETDEV=m
+# CONFIG_CRASH_DUMP_MEMDEV is not set
+# CONFIG_CRASH_DUMP_SOFTBOOT is not set
+CONFIG_CRASH_DUMP_COMPRESS_RLE=m
+CONFIG_CRASH_DUMP_COMPRESS_GZIP=m
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_KPROBES=y
+CONFIG_DEBUG_STACK_USAGE=y
+# CONFIG_DEBUG_SLAB is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+# CONFIG_XMON_DEFAULT is not set
+CONFIG_KDB=y
+CONFIG_KDB_MODULES=y
+CONFIG_KDB_OFF=y
+# CONFIG_PPCDBG is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_IRQSTACKS=y
+
+#
+# Security options
+#
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_CAPABILITIES=m
+CONFIG_SECURITY_ROOTPLUG=m
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+CONFIG_SECURITY_SELINUX_MLS=y
+CONFIG_SECURITY_SUBDOMAIN=m
+
+#
+# IBM Crypto Hardware support
+#
+CONFIG_IBM_CRYPTO=m
+CONFIG_ICA_LEEDSLITE=m
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_TEST=m
+
+#
+# Library routines
+#
+CONFIG_CRC32=y
+CONFIG_QSORT=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+
+#
+# Build options
+#
+CONFIG_SUSE_KERNEL=y
+CONFIG_CFGNAME="pseries64"
+CONFIG_RELEASE="7.312"
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc64.config

new file mode 100644 (file)

index 0000000..bc8a338
--- /dev/null
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc64.config
@@ -0,0 +1,1619 @@
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_64BIT=y
+CONFIG_MMU=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_HAVE_DEC_LOCK=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_COMPAT=y
+CONFIG_FRAME_POINTER=y
+CONFIG_FORCE_MAX_ZONEORDER=13
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_STANDALONE=y
+CONFIG_BROKEN_ON_SMP=y
+
+#
+# General setup
+#
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_LOG_BUF_SHIFT=19
+CONFIG_HOTPLUG=y
+CONFIG_EVLOG=y
+# CONFIG_EVLOG_FWPRINTK is not set
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_EMBEDDED is not set
+
+#
+# Class Based Kernel Resource Management
+#
+CONFIG_CKRM=y
+CONFIG_RCFS_FS=m
+CONFIG_CKRM_TYPE_TASKCLASS=y
+CONFIG_CKRM_RES_NUMTASKS=m
+CONFIG_CKRM_CPU_SCHEDULE=y
+# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
+CONFIG_CKRM_RES_BLKIO=y
+CONFIG_CKRM_TYPE_SOCKETCLASS=y
+CONFIG_CKRM_RBCE=m
+CONFIG_CKRM_CRBCE=m
+CONFIG_DELAY_ACCT=y
+CONFIG_KALLSYMS=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_IOSCHED_PS=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+CONFIG_OBSOLETE_MODPARM=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+
+#
+# Platform support
+#
+# CONFIG_PPC_ISERIES is not set
+CONFIG_PPC_PSERIES=y
+CONFIG_PPC=y
+CONFIG_PPC64=y
+CONFIG_PPC_OF=y
+CONFIG_ALTIVEC=y
+# CONFIG_PPC_PMAC is not set
+CONFIG_PPC_SPLPAR=y
+# CONFIG_BOOTX_TEXT is not set
+# CONFIG_POWER4_ONLY is not set
+# CONFIG_IOMMU_VMERGE is not set
+# CONFIG_SMP is not set
+CONFIG_PPC_RTAS=y
+CONFIG_RTAS_FLASH=m
+CONFIG_SCANLOG=m
+CONFIG_LPARCFG=y
+
+#
+# General setup
+#
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_NAMES is not set
+
+#
+# PCMCIA/CardBus support
+#
+# CONFIG_PCMCIA is not set
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_FAKE is not set
+# CONFIG_HOTPLUG_PCI_CPCI is not set
+# CONFIG_HOTPLUG_PCI_PCIE is not set
+# CONFIG_HOTPLUG_PCI_SHPC is not set
+CONFIG_HOTPLUG_PCI_RPA=y
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=y
+CONFIG_PROC_DEVICETREE=y
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_FW_LOADER=m
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+CONFIG_PARPORT_PC_FIFO=y
+CONFIG_PARPORT_PC_SUPERIO=y
+CONFIG_PARPORT_OTHER=y
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_CARMEL is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=123456
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CIPHER_TWOFISH=m
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+CONFIG_BLK_DEV_IDEDISK=y
+# CONFIG_IDEDISK_MULTI_MODE is not set
+# CONFIG_IDEDISK_STROKE is not set
+CONFIG_BLK_DEV_IDECD=y
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+# CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_IDE_TASK_IOCTL=y
+# CONFIG_IDE_TASKFILE_IO is not set
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+CONFIG_BLK_DEV_GENERIC=y
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_SL82C105=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+CONFIG_BLK_DEV_IDEDMA_FORCED=y
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_ADMA=y
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+# CONFIG_BLK_DEV_PIIX is not set
+# CONFIG_BLK_DEV_NS87415 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+CONFIG_PDC202XX_BURST=y
+# CONFIG_BLK_DEV_PDC202XX_NEW is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+CONFIG_BLK_DEV_SIIMAGE=y
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_SCSI=m
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=m
+CONFIG_SCSI_DUMP=m
+CONFIG_SD_IOSTATS=y
+CONFIG_CHR_DEV_ST=m
+# CONFIG_CHR_DEV_OSST is not set
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_CHR_DEV_SCH=m
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI Transport Attributes
+#
+CONFIG_SCSI_SPI_ATTRS=m
+CONFIG_SCSI_FC_ATTRS=m
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+CONFIG_SCSI_3W_9XXX=m
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+CONFIG_SCSI_ADP94XX=m
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+# CONFIG_SCSI_AIC79XX is not set
+# CONFIG_SCSI_ADVANSYS is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+# CONFIG_MEGARAID_SAS is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_CPQFCTS is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_EATA_PIO is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+CONFIG_SCSI_IBMVSCSI=m
+CONFIG_SCSI_IBMVSCSIS=m
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_PPA is not set
+# CONFIG_SCSI_IMM is not set
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY=y
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_IPR=m
+CONFIG_SCSI_IPR_TRACE=y
+CONFIG_SCSI_IPR_DUMP=y
+CONFIG_SCSI_IPR_EEH_RECOVERY=y
+# CONFIG_SCSI_QLOGIC_ISP is not set
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA2XXX=m
+# CONFIG_SCSI_QLA21XX is not set
+# CONFIG_SCSI_QLA22XX is not set
+CONFIG_SCSI_QLA2300=m
+# CONFIG_SCSI_QLA2322 is not set
+CONFIG_SCSI_QLA24XX=m
+CONFIG_SCSI_QLA2XXX_FAILOVER=y
+CONFIG_SCSI_QLA4XXX=m
+CONFIG_SCSI_QLA4XXX_FAILOVER=y
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+CONFIG_SCSI_DEBUG=m
+CONFIG_ATA=m
+# CONFIG_ATA_NONSTANDARD is not set
+# CONFIG_SATA_AHCI is not set
+# CONFIG_SATA_SVW is not set
+# CONFIG_ATA_PIIX is not set
+# CONFIG_SATA_MV is not set
+# CONFIG_SATA_NV is not set
+# CONFIG_PDC_ADMA is not set
+# CONFIG_SATA_QSTOR is not set
+# CONFIG_SATA_PROMISE is not set
+# CONFIG_SATA_SX4 is not set
+# CONFIG_SATA_SIL is not set
+# CONFIG_SATA_SIL24 is not set
+# CONFIG_SATA_SIS is not set
+# CONFIG_SATA_ULI is not set
+# CONFIG_SATA_VIA is not set
+# CONFIG_SATA_VITESSE is not set
+# CONFIG_SATA_INIC162X is not set
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD640_PCI is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MARVELL is not set
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+CONFIG_PATA_PDC2027X=m
+# CONFIG_PATA_SIL680 is not set
+# CONFIG_PATA_SIS is not set
+# CONFIG_PATA_VIA is not set
+# CONFIG_PATA_WINBOND is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_RAID6=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
+CONFIG_DM_FLAKEY=m
+CONFIG_BLK_DEV_DM_BBR=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=y
+# CONFIG_FUSION_SPI is not set
+# CONFIG_FUSION_FC is not set
+CONFIG_FUSION_SAS=m
+CONFIG_FUSION_MAX_SGE=128
+
+#
+# IEEE 1394 (FireWire) support
+#
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Macintosh device drivers
+#
+
+#
+# Networking support
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_UNIX=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+# CONFIG_IP_ROUTE_FWMARK is not set
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+# CONFIG_IP_ROUTE_VERBOSE is not set
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+# CONFIG_ACCEPT_QUEUES is not set
+
+#
+# IP: Virtual Server Configuration
+#
+CONFIG_IP_VS=m
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=12
+
+#
+# IPVS transport protocol load balancing support
+#
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_PROTO_ESP=y
+CONFIG_IP_VS_PROTO_AH=y
+
+#
+# IPVS scheduler
+#
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+CONFIG_IP_VS_SED=m
+CONFIG_IP_VS_NQ=m
+
+#
+# IPVS application helper
+#
+CONFIG_IP_VS_FTP=m
+CONFIG_IPV6=m
+CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_NDISC_NEW=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_IPV6_TUNNEL=m
+
+#
+# MOBILE IPv6 (EXPERIMENTAL)
+#
+CONFIG_IPV6_MOBILITY=m
+CONFIG_IPV6_MOBILITY_MN=m
+CONFIG_IPV6_MOBILITY_HA=m
+# CONFIG_IPV6_MOBILITY_DEBUG is not set
+# CONFIG_DECNET is not set
+CONFIG_BRIDGE=m
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_CT_PROTO_SCTP=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_POLICY=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_PHYSDEV=m
+CONFIG_IP_NF_MATCH_SCTP=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_TARGET_CLASSIFY=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+# CONFIG_IP_NF_COMPAT_IPCHAINS is not set
+# CONFIG_IP_NF_COMPAT_IPFWADM is not set
+CONFIG_IP_NF_CONNTRACK_MARK=y
+CONFIG_IP_NF_TARGET_CONNMARK=m
+CONFIG_IP_NF_MATCH_CONNMARK=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_MATCH_ADDRTYPE=m
+CONFIG_IP_NF_MATCH_HASHLIMIT=m
+# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set
+
+#
+# IPv6: Netfilter Configuration
+#
+CONFIG_IP6_NF_FTP=m
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_CONNTRACK=m
+CONFIG_IP6_NF_MATCH_STATE=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+
+#
+# Bridge: Netfilter Configuration
+#
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+# CONFIG_BRIDGE_EBT_SNAT is not set
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+CONFIG_SCTP_HMAC_NONE=y
+# CONFIG_SCTP_HMAC_SHA1 is not set
+# CONFIG_SCTP_HMAC_MD5 is not set
+# CONFIG_ATM is not set
+CONFIG_VLAN_8021Q=m
+CONFIG_LLC=y
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_IPX_INTERN=y
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_CSZ=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_DELAY=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+# CONFIG_ETHERTAP is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_OAKNET is not set
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+
+#
+# Tulip family network device support
+#
+# CONFIG_NET_TULIP is not set
+# CONFIG_HP100 is not set
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+# CONFIG_AMD8111_ETH is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+# CONFIG_FORCEDETH is not set
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+CONFIG_E100=m
+CONFIG_E100_NAPI=y
+CONFIG_E100_EEH_RECOVERY=y
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+# CONFIG_DL2K is not set
+CONFIG_E1000=m
+CONFIG_E1000_NAPI=y
+CONFIG_E1000_EEH_RECOVERY=y
+CONFIG_IGB=m
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_R8169 is not set
+# CONFIG_SIS190 is not set
+# CONFIG_SK98LIN is not set
+CONFIG_TIGON3=m
+CONFIG_NET_BROADCOM=m
+# CONFIG_NET_BCM44 is not set
+CONFIG_BNX2=m
+# CONFIG_2BUFF_MODE is not set
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_IXGB=m
+CONFIG_IXGB_NAPI=y
+CONFIG_QLA3XXX=m
+CONFIG_IXGB_EEH_RECOVERY=y
+CONFIG_S2IO=m
+CONFIG_S2IO_NAPI=y
+CONFIG_MYRI10GE=m
+CONFIG_NETXEN_NIC=m
+CONFIG_IBMVETH=m
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PLIP is not set
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_MPPE=m
+CONFIG_PPPOE=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+# CONFIG_SLIP_MODE_SLIP6 is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMOL=m
+# CONFIG_IBMLS is not set
+# CONFIG_3C359 is not set
+# CONFIG_TMS380TR is not set
+CONFIG_NET_FC=y
+CONFIG_SHAPER=m
+CONFIG_NETCONSOLE=m
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# Amateur Radio support
+#
+# CONFIG_HAMRADIO is not set
+
+#
+# IrDA (infrared) support
+#
+# CONFIG_IRDA is not set
+
+#
+# Bluetooth support
+#
+# CONFIG_BT is not set
+CONFIG_NETPOLL=y
+CONFIG_NETPOLL_RX=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_NET_POLL_CONTROLLER=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_TSDEV=m
+CONFIG_INPUT_TSDEV_SCREEN_X=240
+CONFIG_INPUT_TSDEV_SCREEN_Y=320
+CONFIG_INPUT_EVDEV=m
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input I/O drivers
+#
+# CONFIG_GAMEPORT is not set
+CONFIG_SOUND_GAMEPORT=y
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_POSFILTER is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+# CONFIG_MOUSE_SERIAL is not set
+# CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=m
+CONFIG_INPUT_UINPUT=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_ECC=m
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_PMACZILOG is not set
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+CONFIG_PRINTER=m
+# CONFIG_LP_CONSOLE is not set
+# CONFIG_PPDEV is not set
+# CONFIG_TIPAR is not set
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVCS=m
+# CONFIG_QIC02_TAPE is not set
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=m
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
+# CONFIG_RTC is not set
+# CONFIG_GEN_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+# CONFIG_FTAPE is not set
+
+#
+# TPM devices
+#
+CONFIG_TCG_TPM=m
+CONFIG_TCG_NSC=m
+CONFIG_TCG_ATMEL=m
+# CONFIG_AGP is not set
+# CONFIG_DRM is not set
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=4096
+CONFIG_HANGCHECK_TIMER=m
+
+#
+# Linux InfraRed Controller
+#
+# CONFIG_LIRC_SUPPORT is not set
+# CONFIG_LIRC_HOMEBREW is not set
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+CONFIG_I2C_CHARDEV=y
+
+#
+# I2C Algorithms
+#
+CONFIG_I2C_ALGOBIT=y
+# CONFIG_I2C_ALGOPCF is not set
+
+#
+# I2C Hardware Bus support
+#
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+# CONFIG_I2C_AMD8111 is not set
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_ISA is not set
+# CONFIG_I2C_NFORCE2 is not set
+# CONFIG_I2C_PARPORT is not set
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_PROSAVAGE is not set
+# CONFIG_I2C_SAVAGE4 is not set
+# CONFIG_SCx200_ACB is not set
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+# CONFIG_I2C_VOODOO3 is not set
+
+#
+# Hardware Sensors Chip support
+#
+# CONFIG_I2C_SENSOR is not set
+# CONFIG_SENSORS_ADM1021 is not set
+# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_DS1621 is not set
+# CONFIG_SENSORS_FSCHER is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+# CONFIG_SENSORS_LM83 is not set
+# CONFIG_SENSORS_LM85 is not set
+# CONFIG_SENSORS_LM90 is not set
+# CONFIG_SENSORS_VIA686A is not set
+# CONFIG_SENSORS_VT1211 is not set
+# CONFIG_SENSORS_W83781D is not set
+# CONFIG_SENSORS_W83L785TS is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_PCF8574 is not set
+
+#
+# Other I2C Chip support
+#
+# CONFIG_SENSORS_EEPROM is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+CONFIG_FB=y
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_OF=y
+# CONFIG_FB_CT65550 is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_S3TRIO is not set
+# CONFIG_FB_VGA16 is not set
+# CONFIG_FB_RIVA is not set
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G450=y
+CONFIG_FB_MATROX_G100=y
+# CONFIG_FB_MATROX_I2C is not set
+CONFIG_FB_MATROX_MULTIHEAD=y
+# CONFIG_FB_RADEON_OLD is not set
+CONFIG_FB_RADEON=y
+CONFIG_FB_RADEON_I2C=y
+# CONFIG_FB_RADEON_DEBUG is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+# CONFIG_FB_SIS is not set
+# CONFIG_FB_NEOMAGIC is not set
+# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set
+# CONFIG_FB_VOODOO1 is not set
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Console display driver support
+#
+# CONFIG_VGA_CONSOLE is not set
+# CONFIG_MDA_CONSOLE is not set
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_PCI_CONSOLE=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Logo configuration
+#
+# CONFIG_LOGO is not set
+
+#
+# Bootsplash configuration
+#
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_EHCI_SPLIT_ISO=y
+CONFIG_USB_EHCI_ROOT_HUB_TT=y
+CONFIG_USB_OHCI_HCD=m
+# CONFIG_USB_UHCI_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_BLUETOOTH_TTY is not set
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+
+#
+# USB Human Interface Devices (HID)
+#
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+# CONFIG_HID_FF is not set
+CONFIG_USB_HIDDEV=y
+
+#
+# USB HID Boot Protocol drivers
+#
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+CONFIG_USB_MTOUCH=m
+CONFIG_USB_XPAD=m
+CONFIG_USB_ATI_REMOTE=m
+
+#
+# USB Imaging devices
+#
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+
+#
+# USB Multimedia devices
+#
+# CONFIG_USB_DABUSB is not set
+
+#
+# Video4Linux support is needed for USB Multimedia device support
+#
+
+#
+# USB Network adaptors
+#
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_USBNET=m
+
+#
+# USB Host-to-Host Cables
+#
+CONFIG_USB_ALI_M5632=y
+CONFIG_USB_AN2720=y
+CONFIG_USB_BELKIN=y
+CONFIG_USB_GENESYS=y
+CONFIG_USB_NET1080=y
+CONFIG_USB_PL2301=y
+
+#
+# Intelligent USB Devices/Gadgets
+#
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_EPSON2888=y
+CONFIG_USB_ZAURUS=y
+CONFIG_USB_CDCETHER=y
+
+#
+# USB Network Adapters
+#
+CONFIG_USB_AX8817X=y
+
+#
+# USB port drivers
+#
+# CONFIG_USB_USS720 is not set
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+# CONFIG_USB_SERIAL_WHITEHEAT is not set
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19=y
+CONFIG_USB_SERIAL_KEYSPAN_USA18X=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_SAFE=m
+CONFIG_USB_SERIAL_SAFE_PADDED=y
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_EZUSB=y
+
+#
+# USB Miscellaneous drivers
+#
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+# CONFIG_USB_TIGL is not set
+# CONFIG_USB_AUERSWALD is not set
+# CONFIG_USB_RIO500 is not set
+CONFIG_USB_LEGOTOWER=m
+# CONFIG_USB_LCD is not set
+CONFIG_USB_LED=m
+CONFIG_USB_CYTHERM=m
+# CONFIG_USB_TEST is not set
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+CONFIG_AUDIT=m
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_DMAPI=y
+# CONFIG_JFS_DEBUG is not set
+CONFIG_JFS_STATISTICS=y
+CONFIG_FS_POSIX_ACL=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_EXPORT=y
+CONFIG_XFS_RT=y
+CONFIG_XFS_QUOTA=m
+CONFIG_XFS_DMAPI=y
+CONFIG_XFS_SECURITY=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_OCFS2_FS=m
+CONFIG_MINIX_FS=m
+# CONFIG_ROMFS_FS is not set
+CONFIG_DMAPI=m
+# CONFIG_DMAPI_DEBUG is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
+CONFIG_QFMT_V2=m
+CONFIG_QUOTACTL=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_UDF_FS=m
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+# CONFIG_DEVFS_FS is not set
+CONFIG_DEVPTS_FS_XATTR=y
+CONFIG_DEVPTS_FS_SECURITY=y
+CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+CONFIG_RELAYFS_FS=m
+# CONFIG_KLOG_CHANNEL is not set
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=y
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_NFS_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_DIRECTIO=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_ACL=y
+CONFIG_NFS_ACL_SUPPORT=y
+# CONFIG_NFSD_V4 is not set
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=y
+CONFIG_STATD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=m
+CONFIG_SUNRPC=y
+CONFIG_SUNRPC_GSS=y
+CONFIG_RPCSEC_GSS_KRB5=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_CIFS=m
+CONFIG_CIFS_STATS=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_ATARI_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+CONFIG_NEC98_PARTITION=y
+CONFIG_SGI_PARTITION=y
+CONFIG_ULTRIX_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_EFI_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+CONFIG_FSHOOKS=y
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+
+#
+# Kernel hacking
+#
+CONFIG_KERNTYPES=y
+CONFIG_CRASH_DUMP=m
+CONFIG_CRASH_DUMP_BLOCKDEV=m
+CONFIG_CRASH_DUMP_NETDEV=m
+# CONFIG_CRASH_DUMP_MEMDEV is not set
+# CONFIG_CRASH_DUMP_SOFTBOOT is not set
+CONFIG_CRASH_DUMP_COMPRESS_RLE=m
+CONFIG_CRASH_DUMP_COMPRESS_GZIP=m
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_KPROBES=y
+CONFIG_DEBUG_STACK_USAGE=y
+# CONFIG_DEBUG_SLAB is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUGGER=y
+CONFIG_XMON=y
+# CONFIG_XMON_DEFAULT is not set
+CONFIG_KDB=y
+CONFIG_KDB_MODULES=y
+CONFIG_KDB_OFF=y
+# CONFIG_PPCDBG is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_IRQSTACKS=y
+
+#
+# Security options
+#
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_CAPABILITIES=m
+CONFIG_SECURITY_ROOTPLUG=m
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+CONFIG_SECURITY_SELINUX_MLS=y
+CONFIG_SECURITY_SUBDOMAIN=m
+
+#
+# IBM Crypto Hardware support
+#
+CONFIG_IBM_CRYPTO=m
+CONFIG_ICA_LEEDSLITE=m
+
+#
+# Cryptographic options
+#
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_TEST=m
+
+#
+# Library routines
+#
+CONFIG_CRC32=y
+CONFIG_QSORT=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+
+#
+# Build options
+#
+CONFIG_SUSE_KERNEL=y
+CONFIG_CFGNAME="pseries64"
+CONFIG_RELEASE="7.312"
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config

index be52d34..df54dfe 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config
@@ -2390,7 +2390,7 @@ CONFIG_MAGIC_SYSRQ=y
  CONFIG_DEBUG_SPINLOCK=y
  CONFIG_DEBUG_SPINLOCK_SLEEP=y
  CONFIG_DEBUG_HIGHMEM=y
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  # CONFIG_FRAME_POINTER is not set
  CONFIG_EARLY_PRINTK=y
  CONFIG_DEBUG_STACKOVERFLOW=y
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config

index 6fab5b2..c1fe7ac 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config
@@ -1967,7 +1967,7 @@ CONFIG_MAGIC_SYSRQ=y
  # CONFIG_DEBUG_SLAB is not set
  CONFIG_DEBUG_SPINLOCK=y
  CONFIG_DEBUG_SPINLOCK_SLEEP=y
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_KPROBES=y
  CONFIG_IA64_GRANULE_16MB=y
  # CONFIG_IA64_GRANULE_64MB is not set
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config

index 0345fbe..4584cf4 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config
@@ -1967,7 +1967,7 @@ CONFIG_MAGIC_SYSRQ=y
  # CONFIG_DEBUG_SLAB is not set
  CONFIG_DEBUG_SPINLOCK=y
  CONFIG_DEBUG_SPINLOCK_SLEEP=y
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_KPROBES=y
  CONFIG_IA64_GRANULE_16MB=y
  # CONFIG_IA64_GRANULE_64MB is not set
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config

index 72d0c0d..5df3558 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config
@@ -2150,7 +2150,7 @@ CONFIG_MAGIC_SYSRQ=y
  # CONFIG_DEBUG_SLAB is not set
  CONFIG_DEBUG_SPINLOCK=y
  CONFIG_DEBUG_SPINLOCK_SLEEP=y
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_INIT_DEBUG=y
  # CONFIG_SCHEDSTATS is not set
  # CONFIG_IOMMU_DEBUG is not set
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config

index 00293e2..e1ee70c 100644 (file)
--- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config
+++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config
@@ -2150,7 +2150,7 @@ CONFIG_MAGIC_SYSRQ=y
  # CONFIG_DEBUG_SLAB is not set
  CONFIG_DEBUG_SPINLOCK=y
  CONFIG_DEBUG_SPINLOCK_SLEEP=y
-# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_INFO=y
  CONFIG_INIT_DEBUG=y
  # CONFIG_SCHEDSTATS is not set
  # CONFIG_IOMMU_DEBUG is not set
diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6-fc5.patch b/lustre/kernel_patches/patches/dev_read_only-2.6-fc5.patch

index 5cab63e..51c7f66 100644 (file)
--- a/lustre/kernel_patches/patches/dev_read_only-2.6-fc5.patch
+++ b/lustre/kernel_patches/patches/dev_read_only-2.6-fc5.patch
@@ -1,7 +1,8 @@
-diff -rup linux-2.6.16.i686.orig/block/ll_rw_blk.c linux-2.6.16.i686/block/ll_rw_blk.c
---- linux-2.6.16.i686.orig/block/ll_rw_blk.c   2007-05-29 15:24:36.000000000 +0300
-+++ linux-2.6.16.i686/block/ll_rw_blk.c        2007-05-29 15:33:50.000000000 +0300
-@@ -2940,6 +2940,8 @@ static void handle_bad_sector(struct bio
+Index: linux-2.6.16.i686/block/ll_rw_blk.c
+===================================================================
+--- linux-2.6.16.i686.orig/block/ll_rw_blk.c
++++ linux-2.6.16.i686/block/ll_rw_blk.c
+@@ -2989,6 +2989,8 @@ static void handle_bad_sector(struct bio
         set_bit(BIO_EOF, &bio->bi_flags);
   }
   
@@ -10,7 +11,7 @@ diff -rup linux-2.6.16.i686.orig/block/ll_rw_blk.c linux-2.6.16.i686/block/ll_rw
   /**
    * generic_make_request: hand a buffer to its device driver for I/O
    * @bio:  The bio describing the location in memory and on the device.
-@@ -3020,6 +3022,12 @@ end_io:
+@@ -3075,6 +3077,12 @@ end_io:
   
                 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                         goto end_io;
@@ -23,7 +24,7 @@ diff -rup linux-2.6.16.i686.orig/block/ll_rw_blk.c linux-2.6.16.i686/block/ll_rw
   
                 /*
                  * If this device has partitions, remap block n
-@@ -3593,6 +3601,91 @@ void swap_io_context(struct io_context *
+@@ -3697,6 +3705,91 @@ void swap_io_context(struct io_context *
         *ioc2 = temp;
   }
   EXPORT_SYMBOL(swap_io_context);
@@ -115,21 +116,23 @@ diff -rup linux-2.6.16.i686.orig/block/ll_rw_blk.c linux-2.6.16.i686/block/ll_rw
   
   /*
    * sysfs parts below
-diff -rup linux-2.6.16.i686.orig/fs/block_dev.c linux-2.6.16.i686/fs/block_dev.c
---- linux-2.6.16.i686.orig/fs/block_dev.c      2006-03-20 07:53:29.000000000 +0200
-+++ linux-2.6.16.i686/fs/block_dev.c   2007-05-29 15:35:00.000000000 +0300
-@@ -60,6 +60,7 @@ static void kill_bdev(struct block_devic
- {
-       invalidate_bdev(bdev, 1);
-       truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
-+      dev_clear_rdonly(bdev);
- }     
- 
- int set_blocksize(struct block_device *bdev, int size)
-diff -rup linux-2.6.16.i686.orig/include/linux/fs.h linux-2.6.16.i686/include/linux/fs.h
---- linux-2.6.16.i686.orig/include/linux/fs.h  2007-05-29 15:24:38.000000000 +0300
-+++ linux-2.6.16.i686/include/linux/fs.h       2007-05-29 15:33:50.000000000 +0300
-@@ -1541,6 +1541,10 @@ extern void file_kill(struct file *f);
+Index: linux-2.6.16.i686/fs/block_dev.c
+===================================================================
+--- linux-2.6.16.i686.orig/fs/block_dev.c
++++ linux-2.6.16.i686/fs/block_dev.c
+@@ -763,6 +763,7 @@ int blkdev_put(struct block_device *bdev
+                       blkdev_put(bdev->bd_contains);
+               }
+               bdev->bd_contains = NULL;
++              dev_clear_rdonly(bdev);
+       }
+       unlock_kernel();
+       up(&bdev->bd_sem);
+Index: linux-2.6.16.i686/include/linux/fs.h
+===================================================================
+--- linux-2.6.16.i686.orig/include/linux/fs.h
++++ linux-2.6.16.i686/include/linux/fs.h
+@@ -1595,6 +1595,10 @@ extern void file_kill(struct file *f);
   struct bio;
   extern void submit_bio(int, struct bio *);
   extern int bdev_read_only(struct block_device *);
diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6-lnxi.patch b/lustre/kernel_patches/patches/dev_read_only-2.6-lnxi.patch

index c6b38ab..097e604 100644 (file)
--- a/lustre/kernel_patches/patches/dev_read_only-2.6-lnxi.patch
+++ b/lustre/kernel_patches/patches/dev_read_only-2.6-lnxi.patch
@@ -1,7 +1,8 @@
-diff -ur linux-2.6.5-lnxi.orig/drivers/block/ll_rw_blk.c linux-2.6.5-lnxi/drivers/block/ll_rw_blk.c
---- linux-2.6.5-lnxi.orig/drivers/block/ll_rw_blk.c    2004-11-11 07:28:51.000000000 -0800
-+++ linux-2.6.5-lnxi/drivers/block/ll_rw_blk.c 2005-04-11 09:42:22.750936924 -0700
-@@ -2458,7 +2458,7 @@ static inline void blk_partition_remap(s
+Index: linux-2.6.5-lnxi/drivers/block/ll_rw_blk.c
+===================================================================
+--- linux-2.6.5-lnxi.orig/drivers/block/ll_rw_blk.c
++++ linux-2.6.5-lnxi/drivers/block/ll_rw_blk.c
+@@ -2718,7 +2718,7 @@ static inline void blk_partition_remap(s
         }
   }
   
@@ -10,7 +11,7 @@ diff -ur linux-2.6.5-lnxi.orig/drivers/block/ll_rw_blk.c linux-2.6.5-lnxi/driver
   
   /**
    * generic_make_request: hand a buffer to its device driver for I/O
-@@ -2550,7 +2550,7 @@ end_io:
+@@ -2810,7 +2810,7 @@ end_io:
   
                 /* this is cfs's dev_rdonly check */
                 if (bio->bi_rw == WRITE &&
@@ -19,7 +20,7 @@ diff -ur linux-2.6.5-lnxi.orig/drivers/block/ll_rw_blk.c linux-2.6.5-lnxi/driver
                         bio_endio(bio, bio->bi_size, 0);
                         break;
                 }
-@@ -3086,53 +3086,86 @@ void swap_io_context(struct io_context *
+@@ -3395,53 +3395,86 @@ void swap_io_context(struct io_context *
         *ioc2 = temp;
   }
   
@@ -140,21 +141,23 @@ diff -ur linux-2.6.5-lnxi.orig/drivers/block/ll_rw_blk.c linux-2.6.5-lnxi/driver
   }
   
   EXPORT_SYMBOL(dev_set_rdonly);
-diff -ur linux-2.6.5-lnxi.orig/fs/block_dev.c linux-2.6.5-lnxi/fs/block_dev.c
---- linux-2.6.5-lnxi.orig/fs/block_dev.c       2004-11-11 07:28:30.000000000 -0800
-+++ linux-2.6.5-lnxi/fs/block_dev.c    2005-04-11 09:49:01.891407856 -0700
-@@ -739,6 +739,7 @@ int blkdev_put(struct block_device *bdev
+Index: linux-2.6.5-lnxi/fs/block_dev.c
+===================================================================
+--- linux-2.6.5-lnxi.orig/fs/block_dev.c
++++ linux-2.6.5-lnxi/fs/block_dev.c
+@@ -767,6 +767,7 @@ int blkdev_put(struct block_device *bdev
+                       blkdev_put(bdev->bd_contains);
+               }
+               bdev->bd_contains = NULL;
++              dev_clear_rdonly(bdev);
         }
         unlock_kernel();
         up(&bdev->bd_sem);
-+      dev_clear_rdonly(bdev);
-       bdput(bdev);
-       return ret;
- }
-diff -ur linux-2.6.5-lnxi.orig/include/linux/fs.h linux-2.6.5-lnxi/include/linux/fs.h
---- linux-2.6.5-lnxi.orig/include/linux/fs.h   2004-11-11 07:28:45.000000000 -0800
-+++ linux-2.6.5-lnxi/include/linux/fs.h        2005-04-11 09:43:27.423116140 -0700
-@@ -1385,6 +1385,10 @@ extern void file_kill(struct file *f);
+Index: linux-2.6.5-lnxi/include/linux/fs.h
+===================================================================
+--- linux-2.6.5-lnxi.orig/include/linux/fs.h
++++ linux-2.6.5-lnxi/include/linux/fs.h
+@@ -1424,6 +1424,10 @@ extern void file_kill(struct file *f);
   struct bio;
   extern int submit_bio(int, struct bio *);
   extern int bdev_read_only(struct block_device *);
diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6-suse.patch b/lustre/kernel_patches/patches/dev_read_only-2.6-suse.patch

index e486944..a93ffa5 100644 (file)
--- a/lustre/kernel_patches/patches/dev_read_only-2.6-suse.patch
+++ b/lustre/kernel_patches/patches/dev_read_only-2.6-suse.patch
@@ -2,7 +2,7 @@ Index: linux-2.6.9/drivers/block/ll_rw_blk.c
  ===================================================================
  --- linux-2.6.9.orig/drivers/block/ll_rw_blk.c
  +++ linux-2.6.9/drivers/block/ll_rw_blk.c
-@@ -2326,6 +2326,8 @@ static inline int attempt_front_merge(re
+@@ -2331,6 +2331,8 @@ static inline int attempt_front_merge(re
         return 0;
   }
   
@@ -11,7 +11,7 @@ Index: linux-2.6.9/drivers/block/ll_rw_blk.c
   /**
    * blk_attempt_remerge  - attempt to remerge active head with next request
    * @q:    The &request_queue_t belonging to the device
-@@ -2631,6 +2633,13 @@ end_io:
+@@ -2636,6 +2638,13 @@ end_io:
                 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))
                         goto end_io;
   
@@ -25,7 +25,7 @@ Index: linux-2.6.9/drivers/block/ll_rw_blk.c
                 /*
                  * If this device has partitions, remap block n
                  * of partition p to block n+start(p) of the disk.
-@@ -3180,6 +3189,92 @@ void swap_io_context(struct io_context *
+@@ -3185,6 +3194,92 @@ void swap_io_context(struct io_context *
   
   
   /*
@@ -122,19 +122,19 @@ Index: linux-2.6.9/fs/block_dev.c
  ===================================================================
  --- linux-2.6.9.orig/fs/block_dev.c
  +++ linux-2.6.9/fs/block_dev.c
-@@ -60,6 +60,7 @@ static void kill_bdev(struct block_devic
- {
-       invalidate_bdev(bdev, 1);
-       truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
-+      dev_clear_rdonly(bdev);
- }     
- 
- int set_blocksize(struct block_device *bdev, int size)
+@@ -744,6 +744,7 @@ int blkdev_put(struct block_device *bdev
+                       blkdev_put(bdev->bd_contains);
+               }
+               bdev->bd_contains = NULL;
++              dev_clear_rdonly(bdev);
+       }
+       unlock_kernel();
+       up(&bdev->bd_sem);
  Index: linux-2.6.9/include/linux/fs.h
  ===================================================================
  --- linux-2.6.9.orig/include/linux/fs.h
  +++ linux-2.6.9/include/linux/fs.h
-@@ -1492,6 +1492,10 @@ extern void file_kill(struct file *f);
+@@ -1479,6 +1479,10 @@ extern void file_kill(struct file *f);
   struct bio;
   extern void submit_bio(int, struct bio *);
   extern int bdev_read_only(struct block_device *);
diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch

index ff6cf91..b4704a5 100644 (file)
--- a/lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch
+++ b/lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch
@@ -1,7 +1,8 @@
-diff -urp linux-2.6.18.1.orig/block/ll_rw_blk.c linux-2.6.18.1/block/ll_rw_blk.c
---- linux-2.6.18.1.orig/block/ll_rw_blk.c      2006-10-14 06:34:03.000000000 +0300
-+++ linux-2.6.18.1/block/ll_rw_blk.c   2007-05-29 14:50:46.000000000 +0300
-@@ -2993,6 +2993,8 @@ static void handle_bad_sector(struct bio
+Index: linux-2.6.18.1/block/ll_rw_blk.c
+===================================================================
+--- linux-2.6.18.1.orig/block/ll_rw_blk.c
++++ linux-2.6.18.1/block/ll_rw_blk.c
+@@ -3067,6 +3067,8 @@ static void handle_bad_sector(struct bio
         set_bit(BIO_EOF, &bio->bi_flags);
   }
   
@@ -10,7 +11,7 @@ diff -urp linux-2.6.18.1.orig/block/ll_rw_blk.c linux-2.6.18.1/block/ll_rw_blk.c
   /**
    * generic_make_request: hand a buffer to its device driver for I/O
    * @bio:  The bio describing the location in memory and on the device.
-@@ -3076,6 +3078,12 @@ end_io:
+@@ -3151,6 +3153,12 @@ end_io:
   
                 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                         goto end_io;
@@ -23,7 +24,7 @@ diff -urp linux-2.6.18.1.orig/block/ll_rw_blk.c linux-2.6.18.1/block/ll_rw_blk.c
   
                 /*
                  * If this device has partitions, remap block n
-@@ -3675,6 +3683,91 @@ void swap_io_context(struct io_context *
+@@ -3765,6 +3773,91 @@ void swap_io_context(struct io_context *
         *ioc2 = temp;
   }
   EXPORT_SYMBOL(swap_io_context);
@@ -115,21 +116,23 @@ diff -urp linux-2.6.18.1.orig/block/ll_rw_blk.c linux-2.6.18.1/block/ll_rw_blk.c
   
   /*
    * sysfs parts below
-diff -urp linux-2.6.18.1.orig/fs/block_dev.c linux-2.6.18.1/fs/block_dev.c
---- linux-2.6.18.1.orig/fs/block_dev.c 2006-10-14 06:34:03.000000000 +0300
-+++ linux-2.6.18.1/fs/block_dev.c      2007-05-29 14:53:38.000000000 +0300
-@@ -58,6 +58,7 @@ static void kill_bdev(struct block_devic
- {
-       invalidate_bdev(bdev, 1);
-       truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
-+      dev_clear_rdonly(bdev);
- }     
- 
- int set_blocksize(struct block_device *bdev, int size)
-diff -urp linux-2.6.18.1.orig/include/linux/fs.h linux-2.6.18.1/include/linux/fs.h
---- linux-2.6.18.1.orig/include/linux/fs.h     2006-10-14 06:34:03.000000000 +0300
-+++ linux-2.6.18.1/include/linux/fs.h  2007-05-29 14:50:46.000000000 +0300
-@@ -1632,6 +1632,10 @@ extern void file_kill(struct file *f);
+Index: linux-2.6.18.1/fs/block_dev.c
+===================================================================
+--- linux-2.6.18.1.orig/fs/block_dev.c
++++ linux-2.6.18.1/fs/block_dev.c
+@@ -1059,6 +1059,7 @@ static int __blkdev_put(struct block_dev
+               if (bdev != bdev->bd_contains)
+                       victim = bdev->bd_contains;
+               bdev->bd_contains = NULL;
++              dev_clear_rdonly(bdev);
+       }
+       unlock_kernel();
+       mutex_unlock(&bdev->bd_mutex);
+Index: linux-2.6.18.1/include/linux/fs.h
+===================================================================
+--- linux-2.6.18.1.orig/include/linux/fs.h
++++ linux-2.6.18.1/include/linux/fs.h
+@@ -1685,6 +1685,10 @@ extern void file_kill(struct file *f);
   struct bio;
   extern void submit_bio(int, struct bio *);
   extern int bdev_read_only(struct block_device *);
diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch

index 8d144d6..8a46dc5 100644 (file)
--- a/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch
+++ b/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch
@@ -1,8 +1,8 @@
  Index: linux-2.6.22.5/block/ll_rw_blk.c
  ===================================================================
---- linux-2.6.22.5.orig/block/ll_rw_blk.c      2007-08-22 17:23:54.000000000 -0600
-+++ linux-2.6.22.5/block/ll_rw_blk.c   2008-02-21 01:07:16.000000000 -0700
-@@ -3101,6 +3101,8 @@
+--- linux-2.6.22.5.orig/block/ll_rw_blk.c
++++ linux-2.6.22.5/block/ll_rw_blk.c
+@@ -3101,6 +3101,8 @@ static inline int should_fail_request(st
   
   #endif /* CONFIG_FAIL_MAKE_REQUEST */
   
@@ -11,7 +11,7 @@ Index: linux-2.6.22.5/block/ll_rw_blk.c
   /**
    * generic_make_request: hand a buffer to its device driver for I/O
    * @bio:  The bio describing the location in memory and on the device.
-@@ -3185,6 +3187,12 @@
+@@ -3185,6 +3187,12 @@ end_io:
   
                 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                         goto end_io;
@@ -24,7 +24,7 @@ Index: linux-2.6.22.5/block/ll_rw_blk.c
   
                 if (should_fail_request(bio))
                         goto end_io;
-@@ -3850,6 +3858,91 @@
+@@ -3850,6 +3858,91 @@ void swap_io_context(struct io_context *
         *ioc2 = temp;
   }
   EXPORT_SYMBOL(swap_io_context);
@@ -118,21 +118,21 @@ Index: linux-2.6.22.5/block/ll_rw_blk.c
    * sysfs parts below
  Index: linux-2.6.22.5/fs/block_dev.c
  ===================================================================
---- linux-2.6.22.5.orig/fs/block_dev.c 2007-08-22 17:23:54.000000000 -0600
-+++ linux-2.6.22.5/fs/block_dev.c      2008-02-21 01:07:16.000000000 -0700
-@@ -63,6 +63,7 @@
-               return;
-       invalidate_bh_lrus();
-       truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
-+      dev_clear_rdonly(bdev);
- }     
- 
- int set_blocksize(struct block_device *bdev, int size)
+--- linux-2.6.22.5.orig/fs/block_dev.c
++++ linux-2.6.22.5/fs/block_dev.c
+@@ -1294,6 +1294,7 @@ static int __blkdev_put(struct block_dev
+               if (bdev != bdev->bd_contains)
+                       victim = bdev->bd_contains;
+               bdev->bd_contains = NULL;
++              dev_clear_rdonly(bdev);
+       }
+       unlock_kernel();
+       mutex_unlock(&bdev->bd_mutex);
  Index: linux-2.6.22.5/include/linux/fs.h
  ===================================================================
---- linux-2.6.22.5.orig/include/linux/fs.h     2008-02-21 00:58:18.000000000 -0700
-+++ linux-2.6.22.5/include/linux/fs.h  2008-02-21 01:07:16.000000000 -0700
-@@ -1744,6 +1744,10 @@
+--- linux-2.6.22.5.orig/include/linux/fs.h
++++ linux-2.6.22.5/include/linux/fs.h
+@@ -1744,6 +1744,10 @@ struct bio;
   extern void submit_bio(int, struct bio *);
   extern int bdev_read_only(struct block_device *);
   #endif
diff --git a/lustre/kernel_patches/patches/md-rebuild-policy.patch b/lustre/kernel_patches/patches/md-rebuild-policy.patch

index e6c9f9c..62bb484 100644 (file)
--- a/lustre/kernel_patches/patches/md-rebuild-policy.patch
+++ b/lustre/kernel_patches/patches/md-rebuild-policy.patch
@@ -33,15 +33,16 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
         { .ctl_name = 0 }
   };
   
-@@ -4980,14 +4998,15 @@ static int is_mddev_idle(mddev_t *mddev)
+@@ -4980,15 +4998,16 @@ static int is_mddev_idle(mddev_t *mddev)
+ {
         mdk_rdev_t * rdev;
-       struct list_head *tmp;
         int idle;
  -      unsigned long curr_events;
  +      unsigned long rw, sync;
   
         idle = 1;
-       ITERATE_RDEV(mddev,rdev,tmp) {
+       rcu_read_lock();
+       rdev_for_each_rcu(rdev, mddev) {
                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
  -              curr_events = disk_stat_read(disk, sectors[0]) + 
  -                              disk_stat_read(disk, sectors[1]) - 
diff --git a/lustre/kernel_patches/patches/md-soft-lockups.patch b/lustre/kernel_patches/patches/md-soft-lockups.patch

new file mode 100644 (file)

index 0000000..cde9a34
--- /dev/null
+++ b/lustre/kernel_patches/patches/md-soft-lockups.patch
@@ -0,0 +1,13 @@
+Index: linux-2.6.18-92.1.10/drivers/md/raid5.c
+===================================================================
+--- linux-2.6.18-92.1.10.orig/drivers/md/raid5.c       2008-11-10 11:00:51.000000000 +0900
++++ linux-2.6.18-92.1.10/drivers/md/raid5.c    2008-11-10 11:02:38.000000000 +0900
+@@ -3251,6 +3251,8 @@
+               handle_stripe(sh, conf->spare_page, NULL);
+               release_stripe(sh);
+ 
++              cond_resched();
++
+               spin_lock_irq(&conf->device_lock);
+       }
+       PRINTK("%d stripes handled\n", handled);
diff --git a/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch b/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch

index 6d584b4..f198a43 100644 (file)
--- a/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch
+++ b/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch
@@ -1693,7 +1693,7 @@ Index: linux-269-5502/include/linux/ptrack.h
  +      .ptrack_list = LIST_HEAD_INIT(tsk.ptrack_list)
  +
  +#else
-+#define ptrack_call_callbacks (phase, child) (0)
++#define ptrack_call_callbacks(phase, child) (0)
  +
  +#define INIT_TASK_PTRACK(tsk)
  +
diff --git a/lustre/kernel_patches/patches/qsnet-suse-2.6.patch b/lustre/kernel_patches/patches/qsnet-suse-2.6.patch

index b312ab0..27b5a52 100644 (file)
--- a/lustre/kernel_patches/patches/qsnet-suse-2.6.patch
+++ b/lustre/kernel_patches/patches/qsnet-suse-2.6.patch
@@ -991,7 +991,7 @@ Index: LINUX-SRC-TREE/include/linux/ptrack.h
  +      .ptrack_list = LIST_HEAD_INIT(tsk.ptrack_list)
  +
  +#else
-+#define ptrack_call_callbacks (phase, child) (0)
++#define ptrack_call_callbacks(phase, child) (0)
  +
  +#define INIT_TASK_PTRACK(tsk)
  +
diff --git a/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch b/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch

new file mode 100644 (file)

index 0000000..4f3a3bc
--- /dev/null
+++ b/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch
@@ -0,0 +1,616 @@
+diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot.c
+--- linux-2.6.16.54-0.2.5/fs/dquot.c   2008-03-18 15:48:26.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/fs/dquot.c     2008-03-17 22:43:11.000000000 +0300
+@@ -1588,10 +1588,19 @@ int vfs_get_dqblk(struct super_block *sb
+ }
+ 
+ /* Generic routine for setting common part of quota structure */
+-static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
++static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
+ {
+       struct mem_dqblk *dm = &dquot->dq_dqb;
+       int check_blim = 0, check_ilim = 0;
++      struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
++
++      if ((di->dqb_valid & QIF_BLIMITS &&
++           (di->dqb_bhardlimit > dqi->dqi_maxblimit ||
++            di->dqb_bsoftlimit > dqi->dqi_maxblimit)) ||
++          (di->dqb_valid & QIF_ILIMITS &&
++           (di->dqb_ihardlimit > dqi->dqi_maxilimit ||
++            di->dqb_isoftlimit > dqi->dqi_maxilimit)))
++              return -ERANGE;
+ 
+       spin_lock(&dq_data_lock);
+       if (di->dqb_valid & QIF_SPACE) {
+@@ -1623,7 +1632,7 @@ static void do_set_dqblk(struct dquot *d
+                       clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+               }
+               else if (!(di->dqb_valid & QIF_BTIME))  /* Set grace only if user hasn't provided his own... */
+-                      dm->dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace;
++                      dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
+       }
+       if (check_ilim) {
+               if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) {
+@@ -1631,7 +1640,7 @@ static void do_set_dqblk(struct dquot *d
+                       clear_bit(DQ_INODES_B, &dquot->dq_flags);
+               }
+               else if (!(di->dqb_valid & QIF_ITIME))  /* Set grace only if user hasn't provided his own... */
+-                      dm->dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
++                      dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
+       }
+       if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit)
+               clear_bit(DQ_FAKE_B, &dquot->dq_flags);
+@@ -1639,21 +1648,24 @@ static void do_set_dqblk(struct dquot *d
+               set_bit(DQ_FAKE_B, &dquot->dq_flags);
+       spin_unlock(&dq_data_lock);
+       mark_dquot_dirty(dquot);
++
++      return 0;
+ }
+ 
+ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di)
+ {
+       struct dquot *dquot;
++      int rc;
+ 
+       mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+       if (!(dquot = dqget(sb, id, type))) {
+               mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+               return -ESRCH;
+       }
+-      do_set_dqblk(dquot, di);
++      rc = do_set_dqblk(dquot, di);
+       dqput(dquot);
+       mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+-      return 0;
++      return rc;
+ }
+ 
+ /* Generic routine for getting common part of quota file information */
+diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v1.c linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c
+--- linux-2.6.16.54-0.2.5/fs/quota_v1.c        2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c  2008-03-17 22:42:47.000000000 +0300
+@@ -139,6 +139,9 @@ static int v1_read_file_info(struct supe
+               goto out;
+       }
+       ret = 0;
++      /* limits are stored as unsigned 32-bit data */
++      dqopt->info[type].dqi_maxblimit = 0xffffffff;
++      dqopt->info[type].dqi_maxilimit = 0xffffffff;
+       dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
+       dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
+ out:
+diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c
+--- linux-2.6.16.54-0.2.5/fs/quota_v2.c        2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c  2008-03-18 11:58:02.000000000 +0300
+@@ -23,26 +23,64 @@ MODULE_LICENSE("GPL");
+ typedef char *dqbuf_t;
+ 
+ #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
+-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
++#define GETENTRIES(buf) ((union v2_disk_dqblk *)(((char *)buf) + \
++                       sizeof(struct v2_disk_dqdbheader)))
++#define REV_ASSERT(r) BUG_ON((rev) != 0 && (rev) != 1)
++
++static const union v2_disk_dqblk emptydquot;
++static const union v2_disk_dqblk fakedquot[2] = {
++      {.r0 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} },
++      {.r1 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} }
++};
+ 
+-/* Check whether given file is really vfsv0 quotafile */
+-static int v2_check_quota_file(struct super_block *sb, int type)
++static inline uint v2_dqblksz(uint rev)
++{
++      uint sz;
++
++      REV_ASSERT(rev);
++
++      if (rev == 0)
++              sz = sizeof(struct v2_disk_dqblk_r0);
++      else
++              sz = sizeof(struct v2_disk_dqblk_r1);
++
++      return sz;
++}
++
++/* Number of quota entries in a block */
++static inline int v2_dqstrinblk(uint rev)
++{
++      return (V2_DQBLKSIZE-sizeof(struct v2_disk_dqdbheader))/v2_dqblksz(rev);
++}
++
++/* Get revision of a quota file, -1 if it does not look a quota file */
++static int v2_quota_file_revision(struct super_block *sb, int type)
+ {
+       struct v2_disk_dqheader dqhead;
+       ssize_t size;
+       static const uint quota_magics[] = V2_INITQMAGICS;
+-      static const uint quota_versions[] = V2_INITQVERSIONS;
++      static const uint quota_versions_r0[] = V2_INITQVERSIONS_R0;
++      static const uint quota_versions_r1[] = V2_INITQVERSIONS_R1;
+  
+       size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
+       if (size != sizeof(struct v2_disk_dqheader)) {
+               printk("quota_v2: failed read expected=%zd got=%zd\n",
+                       sizeof(struct v2_disk_dqheader), size);
+-              return 0;
++              return -1;
+       }
+-      if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
+-          le32_to_cpu(dqhead.dqh_version) != quota_versions[type])
+-              return 0;
+-      return 1;
++      if (le32_to_cpu(dqhead.dqh_magic) == quota_magics[type]) {
++              if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r0[type])
++                      return 0;
++              if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r1[type])
++                      return 1;
++      }
++      return -1;
++}
++
++/* Check whether given file is really vfsv0 quotafile */
++static inline int v2_check_quota_file(struct super_block *sb, int type)
++{
++      return v2_quota_file_revision(sb, type) != -1;
+ }
+ 
+ /* Read information header from quota file */
+@@ -51,6 +89,13 @@ static int v2_read_file_info(struct supe
+       struct v2_disk_dqinfo dinfo;
+       struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+       ssize_t size;
++      int rev;
++
++      rev = v2_quota_file_revision(sb, type);
++      if (rev < 0) {
++              printk(KERN_WARNING "Second quota file check failed.\n");
++              return -1;
++      }
+ 
+       size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+              sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+@@ -65,6 +110,16 @@ static int v2_read_file_info(struct supe
+       info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+       info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+       info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
++
++      info->u.v2_i.dqi_revision = rev;
++      if (rev == 0) {
++              info->dqi_maxblimit = 0xffffffffULL;
++              info->dqi_maxilimit = 0xffffffffULL;
++      } else {
++              info->dqi_maxblimit = 0xffffffffffffffffULL;
++              info->dqi_maxilimit = 0xffffffffffffffffULL;
++      }
++
+       return 0;
+ }
+ 
+@@ -94,29 +149,61 @@ static int v2_write_file_info(struct sup
+       return 0;
+ }
+ 
+-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
++static void disk2memdqb(struct mem_dqblk *m, union v2_disk_dqblk *d, uint rev)
+ {
+-      m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
+-      m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
+-      m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
+-      m->dqb_itime = le64_to_cpu(d->dqb_itime);
+-      m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
+-      m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+-      m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+-      m->dqb_btime = le64_to_cpu(d->dqb_btime);
+-}
+-
+-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+-{
+-      d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
+-      d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
+-      d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
+-      d->dqb_itime = cpu_to_le64(m->dqb_itime);
+-      d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
+-      d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+-      d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+-      d->dqb_btime = cpu_to_le64(m->dqb_btime);
+-      d->dqb_id = cpu_to_le32(id);
++      REV_ASSERT(rev);
++
++      if (rev == 0) {
++              struct v2_disk_dqblk_r0 *ddqblk = &d->r0;
++              m->dqb_ihardlimit = le32_to_cpu(ddqblk->dqb_ihardlimit);
++              m->dqb_isoftlimit = le32_to_cpu(ddqblk->dqb_isoftlimit);
++              m->dqb_curinodes = le32_to_cpu(ddqblk->dqb_curinodes);
++              m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime);
++              m->dqb_bhardlimit = le32_to_cpu(ddqblk->dqb_bhardlimit);
++              m->dqb_bsoftlimit = le32_to_cpu(ddqblk->dqb_bsoftlimit);
++              m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace);
++              m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime);
++      } else {
++              struct v2_disk_dqblk_r1 *ddqblk = &d->r1;
++              m->dqb_ihardlimit = le64_to_cpu(ddqblk->dqb_ihardlimit);
++              m->dqb_isoftlimit = le64_to_cpu(ddqblk->dqb_isoftlimit);
++              m->dqb_curinodes = le64_to_cpu(ddqblk->dqb_curinodes);
++              m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime);
++              m->dqb_bhardlimit = le64_to_cpu(ddqblk->dqb_bhardlimit);
++              m->dqb_bsoftlimit = le64_to_cpu(ddqblk->dqb_bsoftlimit);
++              m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace);
++              m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime);
++      }
++}
++
++static void mem2diskdqb(union v2_disk_dqblk *d, struct mem_dqblk *m,
++                      qid_t id, uint rev)
++{
++      REV_ASSERT(rev);
++
++      if (rev == 0) {
++              struct v2_disk_dqblk_r0 *ddqblk = &d->r0;
++              ddqblk->dqb_id = cpu_to_le32(id);
++              ddqblk->dqb_ihardlimit = cpu_to_le32((__u32)m->dqb_ihardlimit);
++              ddqblk->dqb_isoftlimit = cpu_to_le32((__u32)m->dqb_isoftlimit);
++              ddqblk->dqb_curinodes = cpu_to_le32((__u32)m->dqb_curinodes);
++              ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime);
++              ddqblk->dqb_bhardlimit = cpu_to_le32((__u32)m->dqb_bhardlimit);
++              ddqblk->dqb_bsoftlimit = cpu_to_le32((__u32)m->dqb_bsoftlimit);
++              ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace);
++              ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime);
++      } else {
++              struct v2_disk_dqblk_r1 *ddqblk = &d->r1;
++              ddqblk->dqb_id = cpu_to_le32(id);
++              ddqblk->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
++              ddqblk->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
++              ddqblk->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
++              ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime);
++              ddqblk->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
++              ddqblk->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
++              ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace);
++              ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime);
++      }
+ }
+ 
+ static dqbuf_t getdqbuf(void)
+@@ -268,10 +355,10 @@ static uint find_free_dqentry(struct dqu
+ {
+       struct super_block *sb = dquot->dq_sb;
+       struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
+-      uint blk, i;
++      uint blk, i, rev = info->u.v2_i.dqi_revision;
++      uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev);
+       struct v2_disk_dqdbheader *dh;
+-      struct v2_disk_dqblk *ddquot;
+-      struct v2_disk_dqblk fakedquot;
++      union v2_disk_dqblk *ddquot;
+       dqbuf_t buf;
+ 
+       *err = 0;
+@@ -298,17 +385,18 @@ static uint find_free_dqentry(struct dqu
+               info->u.v2_i.dqi_free_entry = blk;
+               mark_info_dirty(sb, dquot->dq_type);
+       }
+-      if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK)   /* Block will be full? */
++      /* Block will be full? */
++      if (le16_to_cpu(dh->dqdh_entries)+1 >= dqstrinblk)
+               if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
+                       printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
+                       goto out_buf;
+               }
+       dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1);
+-      memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
+       /* Find free structure in block */
+-      for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
++      for (i = 0; i < dqstrinblk && memcmp(&emptydquot, ddquot, dqblksz);
++           i++, ddquot = (char *)ddquot + dqblksz);
+ #ifdef __QUOTA_V2_PARANOIA
+-      if (i == V2_DQSTRINBLK) {
++      if (i == dqstrinblk) {
+               printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
+               *err = -EIO;
+               goto out_buf;
+@@ -318,7 +406,8 @@ static uint find_free_dqentry(struct dqu
+               printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
+               goto out_buf;
+       }
+-      dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
++      dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+
++                      ((char *)ddquot - (char *)buf);
+       freedqbuf(buf);
+       return blk;
+ out_buf:
+@@ -392,7 +481,9 @@ static int v2_write_dquot(struct dquot *
+ {
+       int type = dquot->dq_type;
+       ssize_t ret;
+-      struct v2_disk_dqblk ddquot, empty;
++      union v2_disk_dqblk ddquot;
++      uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision;
++      uint dqblksz = v2_dqblksz(rev);
+ 
+       /* dq_off is guarded by dqio_sem */
+       if (!dquot->dq_off)
+@@ -401,18 +492,22 @@ static int v2_write_dquot(struct dquot *
+                       return ret;
+               }
+       spin_lock(&dq_data_lock);
+-      mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
++      mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id, rev);
+       /* Argh... We may need to write structure full of zeroes but that would be
+        * treated as an empty place by the rest of the code. Format change would
+        * be definitely cleaner but the problems probably are not worth it */
+-      memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+-      if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
+-              ddquot.dqb_itime = cpu_to_le64(1);
++      if (!memcmp(&emptydquot, &ddquot, dqblksz)) {
++              if (rev == 0)
++                      ddquot.r0.dqb_itime = cpu_to_le64(1);
++              else
++                      ddquot.r1.dqb_itime = cpu_to_le64(1);
++      }
+       spin_unlock(&dq_data_lock);
+       ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
+-            (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
+-      if (ret != sizeof(struct v2_disk_dqblk)) {
+-              printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
++            (char *)&ddquot, dqblksz, dquot->dq_off);
++      if (ret != dqblksz) {
++              printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
++                      dquot->dq_sb->s_id);
+               if (ret >= 0)
+                       ret = -ENOSPC;
+       }
+@@ -431,6 +526,7 @@ static int free_dqentry(struct dquot *dq
+       struct v2_disk_dqdbheader *dh;
+       dqbuf_t buf = getdqbuf();
+       int ret = 0;
++      uint rev = sb_dqopt(sb)->info[type].u.v2_i.dqi_revision;
+ 
+       if (!buf)
+               return -ENOMEM;
+@@ -456,8 +552,8 @@ static int free_dqentry(struct dquot *dq
+       }
+       else {
+               memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
+-                sizeof(struct v2_disk_dqblk));
+-              if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
++                v2_dqblksz(rev));
++              if (le16_to_cpu(dh->dqdh_entries) == v2_dqstrinblk(rev)-1) {
+                       /* Insert will write block itself */
+                       if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
+                               printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
+@@ -529,41 +625,56 @@ static int v2_delete_dquot(struct dquot 
+       return remove_tree(dquot, &tmp, 0);
+ }
+ 
++static inline __u32 dqid(union v2_disk_dqblk *ddquot, uint rev)
++{
++      __u32 dq_id;
++
++      REV_ASSERT(rev);
++
++      if (rev == 0)
++              dq_id = le32_to_cpu(ddquot->r0.dqb_id);
++      else
++              dq_id = le32_to_cpu(ddquot->r1.dqb_id);
++
++      return dq_id;
++}
++
+ /* Find entry in block */
+ static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
+ {
+       dqbuf_t buf = getdqbuf();
+       loff_t ret = 0;
+       int i;
+-      struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
++      union v2_disk_dqblk *ddquot = GETENTRIES(buf);
++      int type = dquot->dq_type;
++      uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision;
++      uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev);
+ 
+       if (!buf)
+               return -ENOMEM;
+-      if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
++
++      ret = read_blk(dquot->dq_sb, type, blk, buf);
++      if (ret < 0) {
+               printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+               goto out_buf;
+       }
+       if (dquot->dq_id)
+-              for (i = 0; i < V2_DQSTRINBLK &&
+-                   le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
++              for (i = 0; i < dqstrinblk && dqid(ddquot, rev) != dquot->dq_id;
++                   i++, ddquot = (char *)ddquot + dqblksz);
+       else {  /* ID 0 as a bit more complicated searching... */
+-              struct v2_disk_dqblk fakedquot;
+-
+-              memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
+-              for (i = 0; i < V2_DQSTRINBLK; i++)
+-                      if (!le32_to_cpu(ddquot[i].dqb_id) &&
+-                          memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
++              for (i = 0; i < dqstrinblk; i++, ddquot = (char *)ddquot+dqblksz)
++                      if (!dqid(ddquot, rev) &&
++                          memcmp(&emptydquot, ddquot, dqblksz))
+                               break;
+       }
+-      if (i == V2_DQSTRINBLK) {
++      if (i == dqstrinblk) {
+               printk(KERN_ERR "VFS: Quota for id %u referenced "
+                 "but not present.\n", dquot->dq_id);
+               ret = -EIO;
+               goto out_buf;
+       }
+       else
+-              ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
+-                v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
++              ret = (blk << V2_DQBLKSIZE_BITS)+((char *)ddquot-(char *)buf);
+ out_buf:
+       freedqbuf(buf);
+       return ret;
+@@ -605,7 +716,7 @@ static int v2_read_dquot(struct dquot *d
+ {
+       int type = dquot->dq_type;
+       loff_t offset;
+-      struct v2_disk_dqblk ddquot, empty;
++      union v2_disk_dqblk ddquot;
+       int ret = 0;
+ 
+ #ifdef __QUOTA_V2_PARANOIA
+@@ -626,25 +737,30 @@ static int v2_read_dquot(struct dquot *d
+               ret = offset;
+       }
+       else {
++              uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.
++                         dqi_revision;
++              uint  dqblksz = v2_dqblksz(rev);
+               dquot->dq_off = offset;
+-              if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
+-                  (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
+-                  != sizeof(struct v2_disk_dqblk)) {
++              ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
++                                         (char *)&ddquot, dqblksz, offset);
++              if (ret != dqblksz) {
+                       if (ret >= 0)
+                               ret = -EIO;
+                       printk(KERN_ERR "VFS: Error while reading quota "
+                         "structure for id %u.\n", dquot->dq_id);
+-                      memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
++                      memset(&ddquot, 0, dqblksz);
+               }
+               else {
+                       ret = 0;
+                       /* We need to escape back all-zero structure */
+-                      memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+-                      empty.dqb_itime = cpu_to_le64(1);
+-                      if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
+-                              ddquot.dqb_itime = 0;
++                      if (!memcmp(&fakedquot[rev], &ddquot, dqblksz)) {
++                              if (rev == 0)
++                                      ddquot.r0.dqb_itime = cpu_to_le64(0);
++                              else
++                                      ddquot.r1.dqb_itime = cpu_to_le64(0);
++                      }
+               }
+-              disk2memdqb(&dquot->dq_dqb, &ddquot);
++              disk2memdqb(&dquot->dq_dqb, &ddquot, rev);
+               if (!dquot->dq_dqb.dqb_bhardlimit &&
+                       !dquot->dq_dqb.dqb_bsoftlimit &&
+                       !dquot->dq_dqb.dqb_ihardlimit &&
+diff -rNpu linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h
+--- linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h     2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h       2008-03-17 23:39:54.000000000 +0300
+@@ -21,6 +21,7 @@ struct v2_mem_dqinfo {
+       unsigned int dqi_blocks;
+       unsigned int dqi_free_blk;
+       unsigned int dqi_free_entry;
++      unsigned int dqi_revision;
+ };
+ 
+ #endif /* _LINUX_DQBLK_V2_H */
+diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quota.h linux-2.6.16.54-0.2.5-quota/include/linux/quota.h
+--- linux-2.6.16.54-0.2.5/include/linux/quota.h        2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/include/linux/quota.h  2008-03-17 23:39:54.000000000 +0300
+@@ -148,12 +148,12 @@ struct if_dqinfo {
+  * Data for one user/group kept in memory
+  */
+ struct mem_dqblk {
+-      __u32 dqb_bhardlimit;   /* absolute limit on disk blks alloc */
+-      __u32 dqb_bsoftlimit;   /* preferred limit on disk blks */
++      qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */
++      qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */
+       qsize_t dqb_curspace;   /* current used space */
+-      __u32 dqb_ihardlimit;   /* absolute limit on allocated inodes */
+-      __u32 dqb_isoftlimit;   /* preferred inode limit */
+-      __u32 dqb_curinodes;    /* current # allocated inodes */
++      qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */
++      qsize_t dqb_isoftlimit; /* preferred inode limit */
++      qsize_t dqb_curinodes;  /* current # allocated inodes */
+       time_t dqb_btime;       /* time limit for excessive disk use */
+       time_t dqb_itime;       /* time limit for excessive inode use */
+ };
+@@ -169,6 +169,8 @@ struct mem_dqinfo {
+       unsigned long dqi_flags;
+       unsigned int dqi_bgrace;
+       unsigned int dqi_igrace;
++      qsize_t dqi_maxblimit;
++      qsize_t dqi_maxilimit;
+       union {
+               struct v1_mem_dqinfo v1_i;
+               struct v2_mem_dqinfo v2_i;
+diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h
+--- linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h   2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h     2008-03-17 23:39:54.000000000 +0300
+@@ -16,28 +16,51 @@
+       0xd9c01927      /* GRPQUOTA */\
+ }
+ 
+-#define V2_INITQVERSIONS {\
++#define V2_INITQVERSIONS_R0 {\
+       0,              /* USRQUOTA */\
+       0               /* GRPQUOTA */\
+ }
+ 
++#define V2_INITQVERSIONS_R1 {\
++      1,              /* USRQUOTA */\
++      1               /* GRPQUOTA */\
++}
++
+ /*
+  * The following structure defines the format of the disk quota file
+  * (as it appears on disk) - the file is a radix tree whose leaves point
+  * to blocks of these structures.
+  */
+-struct v2_disk_dqblk {
++struct v2_disk_dqblk_r0 {
+       __le32 dqb_id;          /* id this quota applies to */
+       __le32 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+       __le32 dqb_isoftlimit;  /* preferred inode limit */
+       __le32 dqb_curinodes;   /* current # allocated inodes */
+-      __le32 dqb_bhardlimit;  /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+-      __le32 dqb_bsoftlimit;  /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
++      __le32 dqb_bhardlimit;  /* absolute limit on disk space */
++      __le32 dqb_bsoftlimit;  /* preferred limit on disk space */
++      __le64 dqb_curspace;    /* current space occupied (in bytes) */
++      __le64 dqb_btime;       /* time limit for excessive disk use */
++      __le64 dqb_itime;       /* time limit for excessive inode use */
++};
++
++struct v2_disk_dqblk_r1 {
++      __le32 dqb_id;          /* id this quota applies to */
++      __le32 dqb_padding;     /* padding field */
++      __le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
++      __le64 dqb_isoftlimit;  /* preferred inode limit */
++      __le64 dqb_curinodes;   /* current # allocated inodes */
++      __le64 dqb_bhardlimit;  /* absolute limit on disk space */
++      __le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+       __le64 dqb_curspace;    /* current space occupied (in bytes) */
+       __le64 dqb_btime;       /* time limit for excessive disk use */
+       __le64 dqb_itime;       /* time limit for excessive inode use */
+ };
+ 
++union v2_disk_dqblk {
++      struct v2_disk_dqblk_r0 r0;
++      struct v2_disk_dqblk_r1 r1;
++};
++
+ /*
+  * Here are header structures as written on disk and their in-memory copies
+  */
+@@ -59,7 +82,7 @@ struct v2_disk_dqinfo {
+ 
+ /*
+  *  Structure of header of block with quota structures. It is padded to 16 bytes so
+- *  there will be space for exactly 21 quota-entries in a block
++ *  there will be space for exactly 21 (r0) or 14 (r1) quota-entries in a block
+  */
+ struct v2_disk_dqdbheader {
+       __le32 dqdh_next_free;  /* Number of next block with free entry */
+@@ -74,6 +97,5 @@ struct v2_disk_dqdbheader {
+ #define V2_DQBLKSIZE  (1 << V2_DQBLKSIZE_BITS)        /* Size of block with quota structures */
+ #define V2_DQTREEOFF  1               /* Offset of tree in file in blocks */
+ #define V2_DQTREEDEPTH        4               /* Depth of quota tree */
+-#define V2_DQSTRINBLK ((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk))     /* Number of entries in one blocks */
+ 
+ #endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/lustre/kernel_patches/patches/quota-large-limits-sles10.patch b/lustre/kernel_patches/patches/quota-large-limits-sles10.patch

new file mode 100644 (file)

index 0000000..fcef1c2
--- /dev/null
+++ b/lustre/kernel_patches/patches/quota-large-limits-sles10.patch
@@ -0,0 +1,616 @@
+diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot.c
+--- linux-2.6.16.54-0.2.5/fs/dquot.c   2008-03-18 15:48:26.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/fs/dquot.c     2008-03-17 22:43:11.000000000 +0300
+@@ -1588,10 +1588,19 @@ int vfs_get_dqblk(struct super_block *sb
+ }
+ 
+ /* Generic routine for setting common part of quota structure */
+-static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
++static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
+ {
+       struct mem_dqblk *dm = &dquot->dq_dqb;
+       int check_blim = 0, check_ilim = 0;
++      struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
++
++      if ((di->dqb_valid & QIF_BLIMITS &&
++           (di->dqb_bhardlimit > dqi->dqi_maxblimit ||
++            di->dqb_bsoftlimit > dqi->dqi_maxblimit)) ||
++          (di->dqb_valid & QIF_ILIMITS &&
++           (di->dqb_ihardlimit > dqi->dqi_maxilimit ||
++            di->dqb_isoftlimit > dqi->dqi_maxilimit)))
++              return -ERANGE;
+ 
+       spin_lock(&dq_data_lock);
+       if (di->dqb_valid & QIF_SPACE) {
+@@ -1623,7 +1632,7 @@ static void do_set_dqblk(struct dquot *d
+                       clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+               }
+               else if (!(di->dqb_valid & QIF_BTIME))  /* Set grace only if user hasn't provided his own... */
+-                      dm->dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace;
++                      dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
+       }
+       if (check_ilim) {
+               if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) {
+@@ -1631,7 +1640,7 @@ static void do_set_dqblk(struct dquot *d
+                       clear_bit(DQ_INODES_B, &dquot->dq_flags);
+               }
+               else if (!(di->dqb_valid & QIF_ITIME))  /* Set grace only if user hasn't provided his own... */
+-                      dm->dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
++                      dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
+       }
+       if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit)
+               clear_bit(DQ_FAKE_B, &dquot->dq_flags);
+@@ -1639,21 +1648,24 @@ static void do_set_dqblk(struct dquot *d
+               set_bit(DQ_FAKE_B, &dquot->dq_flags);
+       spin_unlock(&dq_data_lock);
+       mark_dquot_dirty(dquot);
++
++      return 0;
+ }
+ 
+ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di)
+ {
+       struct dquot *dquot;
++      int rc;
+ 
+       down(&sb_dqopt(sb)->dqonoff_sem);
+       if (!(dquot = dqget(sb, id, type))) {
+               up(&sb_dqopt(sb)->dqonoff_sem);
+               return -ESRCH;
+       }
+-      do_set_dqblk(dquot, di);
++      rc = do_set_dqblk(dquot, di);
+       dqput(dquot);
+       up(&sb_dqopt(sb)->dqonoff_sem);
+-      return 0;
++      return rc;
+ }
+ 
+ /* Generic routine for getting common part of quota file information */
+diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v1.c linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c
+--- linux-2.6.16.54-0.2.5/fs/quota_v1.c        2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c  2008-03-17 22:42:47.000000000 +0300
+@@ -139,6 +139,9 @@ static int v1_read_file_info(struct supe
+               goto out;
+       }
+       ret = 0;
++      /* limits are stored as unsigned 32-bit data */
++      dqopt->info[type].dqi_maxblimit = 0xffffffff;
++      dqopt->info[type].dqi_maxilimit = 0xffffffff;
+       dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
+       dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
+ out:
+diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c
+--- linux-2.6.16.54-0.2.5/fs/quota_v2.c        2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c  2008-03-18 11:58:02.000000000 +0300
+@@ -23,26 +23,64 @@ MODULE_LICENSE("GPL");
+ typedef char *dqbuf_t;
+ 
+ #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
+-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
++#define GETENTRIES(buf) ((union v2_disk_dqblk *)(((char *)buf) + \
++                       sizeof(struct v2_disk_dqdbheader)))
++#define REV_ASSERT(r) BUG_ON((rev) != 0 && (rev) != 1)
++
++static const union v2_disk_dqblk emptydquot;
++static const union v2_disk_dqblk fakedquot[2] = {
++      {.r0 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} },
++      {.r1 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} }
++};
+ 
+-/* Check whether given file is really vfsv0 quotafile */
+-static int v2_check_quota_file(struct super_block *sb, int type)
++static inline uint v2_dqblksz(uint rev)
++{
++      uint sz;
++
++      REV_ASSERT(rev);
++
++      if (rev == 0)
++              sz = sizeof(struct v2_disk_dqblk_r0);
++      else
++              sz = sizeof(struct v2_disk_dqblk_r1);
++
++      return sz;
++}
++
++/* Number of quota entries in a block */
++static inline int v2_dqstrinblk(uint rev)
++{
++      return (V2_DQBLKSIZE-sizeof(struct v2_disk_dqdbheader))/v2_dqblksz(rev);
++}
++
++/* Get revision of a quota file, -1 if it does not look a quota file */
++static int v2_quota_file_revision(struct super_block *sb, int type)
+ {
+       struct v2_disk_dqheader dqhead;
+       ssize_t size;
+       static const uint quota_magics[] = V2_INITQMAGICS;
+-      static const uint quota_versions[] = V2_INITQVERSIONS;
++      static const uint quota_versions_r0[] = V2_INITQVERSIONS_R0;
++      static const uint quota_versions_r1[] = V2_INITQVERSIONS_R1;
+  
+       size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
+       if (size != sizeof(struct v2_disk_dqheader)) {
+               printk("quota_v2: failed read expected=%zd got=%zd\n",
+                       sizeof(struct v2_disk_dqheader), size);
+-              return 0;
++              return -1;
+       }
+-      if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
+-          le32_to_cpu(dqhead.dqh_version) != quota_versions[type])
+-              return 0;
+-      return 1;
++      if (le32_to_cpu(dqhead.dqh_magic) == quota_magics[type]) {
++              if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r0[type])
++                      return 0;
++              if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r1[type])
++                      return 1;
++      }
++      return -1;
++}
++
++/* Check whether given file is really vfsv0 quotafile */
++static inline int v2_check_quota_file(struct super_block *sb, int type)
++{
++      return v2_quota_file_revision(sb, type) != -1;
+ }
+ 
+ /* Read information header from quota file */
+@@ -51,6 +89,13 @@ static int v2_read_file_info(struct supe
+       struct v2_disk_dqinfo dinfo;
+       struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+       ssize_t size;
++      int rev;
++
++      rev = v2_quota_file_revision(sb, type);
++      if (rev < 0) {
++              printk(KERN_WARNING "Second quota file check failed.\n");
++              return -1;
++      }
+ 
+       size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+              sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+@@ -65,6 +110,16 @@ static int v2_read_file_info(struct supe
+       info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+       info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+       info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
++
++      info->u.v2_i.dqi_revision = rev;
++      if (rev == 0) {
++              info->dqi_maxblimit = 0xffffffffULL;
++              info->dqi_maxilimit = 0xffffffffULL;
++      } else {
++              info->dqi_maxblimit = 0xffffffffffffffffULL;
++              info->dqi_maxilimit = 0xffffffffffffffffULL;
++      }
++
+       return 0;
+ }
+ 
+@@ -94,29 +149,61 @@ static int v2_write_file_info(struct sup
+       return 0;
+ }
+ 
+-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
++static void disk2memdqb(struct mem_dqblk *m, union v2_disk_dqblk *d, uint rev)
+ {
+-      m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
+-      m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
+-      m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
+-      m->dqb_itime = le64_to_cpu(d->dqb_itime);
+-      m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
+-      m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+-      m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+-      m->dqb_btime = le64_to_cpu(d->dqb_btime);
+-}
+-
+-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+-{
+-      d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
+-      d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
+-      d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
+-      d->dqb_itime = cpu_to_le64(m->dqb_itime);
+-      d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
+-      d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+-      d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+-      d->dqb_btime = cpu_to_le64(m->dqb_btime);
+-      d->dqb_id = cpu_to_le32(id);
++      REV_ASSERT(rev);
++
++      if (rev == 0) {
++              struct v2_disk_dqblk_r0 *ddqblk = &d->r0;
++              m->dqb_ihardlimit = le32_to_cpu(ddqblk->dqb_ihardlimit);
++              m->dqb_isoftlimit = le32_to_cpu(ddqblk->dqb_isoftlimit);
++              m->dqb_curinodes = le32_to_cpu(ddqblk->dqb_curinodes);
++              m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime);
++              m->dqb_bhardlimit = le32_to_cpu(ddqblk->dqb_bhardlimit);
++              m->dqb_bsoftlimit = le32_to_cpu(ddqblk->dqb_bsoftlimit);
++              m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace);
++              m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime);
++      } else {
++              struct v2_disk_dqblk_r1 *ddqblk = &d->r1;
++              m->dqb_ihardlimit = le64_to_cpu(ddqblk->dqb_ihardlimit);
++              m->dqb_isoftlimit = le64_to_cpu(ddqblk->dqb_isoftlimit);
++              m->dqb_curinodes = le64_to_cpu(ddqblk->dqb_curinodes);
++              m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime);
++              m->dqb_bhardlimit = le64_to_cpu(ddqblk->dqb_bhardlimit);
++              m->dqb_bsoftlimit = le64_to_cpu(ddqblk->dqb_bsoftlimit);
++              m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace);
++              m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime);
++      }
++}
++
++static void mem2diskdqb(union v2_disk_dqblk *d, struct mem_dqblk *m,
++                      qid_t id, uint rev)
++{
++      REV_ASSERT(rev);
++
++      if (rev == 0) {
++              struct v2_disk_dqblk_r0 *ddqblk = &d->r0;
++              ddqblk->dqb_id = cpu_to_le32(id);
++              ddqblk->dqb_ihardlimit = cpu_to_le32((__u32)m->dqb_ihardlimit);
++              ddqblk->dqb_isoftlimit = cpu_to_le32((__u32)m->dqb_isoftlimit);
++              ddqblk->dqb_curinodes = cpu_to_le32((__u32)m->dqb_curinodes);
++              ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime);
++              ddqblk->dqb_bhardlimit = cpu_to_le32((__u32)m->dqb_bhardlimit);
++              ddqblk->dqb_bsoftlimit = cpu_to_le32((__u32)m->dqb_bsoftlimit);
++              ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace);
++              ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime);
++      } else {
++              struct v2_disk_dqblk_r1 *ddqblk = &d->r1;
++              ddqblk->dqb_id = cpu_to_le32(id);
++              ddqblk->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
++              ddqblk->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
++              ddqblk->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
++              ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime);
++              ddqblk->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
++              ddqblk->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
++              ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace);
++              ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime);
++      }
+ }
+ 
+ static dqbuf_t getdqbuf(void)
+@@ -268,10 +355,10 @@ static uint find_free_dqentry(struct dqu
+ {
+       struct super_block *sb = dquot->dq_sb;
+       struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
+-      uint blk, i;
++      uint blk, i, rev = info->u.v2_i.dqi_revision;
++      uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev);
+       struct v2_disk_dqdbheader *dh;
+-      struct v2_disk_dqblk *ddquot;
+-      struct v2_disk_dqblk fakedquot;
++      union v2_disk_dqblk *ddquot;
+       dqbuf_t buf;
+ 
+       *err = 0;
+@@ -298,17 +385,18 @@ static uint find_free_dqentry(struct dqu
+               info->u.v2_i.dqi_free_entry = blk;
+               mark_info_dirty(sb, dquot->dq_type);
+       }
+-      if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK)   /* Block will be full? */
++      /* Block will be full? */
++      if (le16_to_cpu(dh->dqdh_entries)+1 >= dqstrinblk)
+               if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
+                       printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
+                       goto out_buf;
+               }
+       dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1);
+-      memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
+       /* Find free structure in block */
+-      for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
++      for (i = 0; i < dqstrinblk && memcmp(&emptydquot, ddquot, dqblksz);
++           i++, ddquot = (char *)ddquot + dqblksz);
+ #ifdef __QUOTA_V2_PARANOIA
+-      if (i == V2_DQSTRINBLK) {
++      if (i == dqstrinblk) {
+               printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
+               *err = -EIO;
+               goto out_buf;
+@@ -318,7 +406,8 @@ static uint find_free_dqentry(struct dqu
+               printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
+               goto out_buf;
+       }
+-      dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
++      dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+
++                      ((char *)ddquot - (char *)buf);
+       freedqbuf(buf);
+       return blk;
+ out_buf:
+@@ -392,7 +481,9 @@ static int v2_write_dquot(struct dquot *
+ {
+       int type = dquot->dq_type;
+       ssize_t ret;
+-      struct v2_disk_dqblk ddquot, empty;
++      union v2_disk_dqblk ddquot;
++      uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision;
++      uint dqblksz = v2_dqblksz(rev);
+ 
+       /* dq_off is guarded by dqio_sem */
+       if (!dquot->dq_off)
+@@ -401,18 +492,22 @@ static int v2_write_dquot(struct dquot *
+                       return ret;
+               }
+       spin_lock(&dq_data_lock);
+-      mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
++      mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id, rev);
+       /* Argh... We may need to write structure full of zeroes but that would be
+        * treated as an empty place by the rest of the code. Format change would
+        * be definitely cleaner but the problems probably are not worth it */
+-      memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+-      if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
+-              ddquot.dqb_itime = cpu_to_le64(1);
++      if (!memcmp(&emptydquot, &ddquot, dqblksz)) {
++              if (rev == 0)
++                      ddquot.r0.dqb_itime = cpu_to_le64(1);
++              else
++                      ddquot.r1.dqb_itime = cpu_to_le64(1);
++      }
+       spin_unlock(&dq_data_lock);
+       ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
+-            (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
+-      if (ret != sizeof(struct v2_disk_dqblk)) {
+-              printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
++            (char *)&ddquot, dqblksz, dquot->dq_off);
++      if (ret != dqblksz) {
++              printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
++                      dquot->dq_sb->s_id);
+               if (ret >= 0)
+                       ret = -ENOSPC;
+       }
+@@ -431,6 +526,7 @@ static int free_dqentry(struct dquot *dq
+       struct v2_disk_dqdbheader *dh;
+       dqbuf_t buf = getdqbuf();
+       int ret = 0;
++      uint rev = sb_dqopt(sb)->info[type].u.v2_i.dqi_revision;
+ 
+       if (!buf)
+               return -ENOMEM;
+@@ -456,8 +552,8 @@ static int free_dqentry(struct dquot *dq
+       }
+       else {
+               memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
+-                sizeof(struct v2_disk_dqblk));
+-              if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
++                v2_dqblksz(rev));
++              if (le16_to_cpu(dh->dqdh_entries) == v2_dqstrinblk(rev)-1) {
+                       /* Insert will write block itself */
+                       if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
+                               printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
+@@ -529,41 +625,56 @@ static int v2_delete_dquot(struct dquot 
+       return remove_tree(dquot, &tmp, 0);
+ }
+ 
++static inline __u32 dqid(union v2_disk_dqblk *ddquot, uint rev)
++{
++      __u32 dq_id;
++
++      REV_ASSERT(rev);
++
++      if (rev == 0)
++              dq_id = le32_to_cpu(ddquot->r0.dqb_id);
++      else
++              dq_id = le32_to_cpu(ddquot->r1.dqb_id);
++
++      return dq_id;
++}
++
+ /* Find entry in block */
+ static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
+ {
+       dqbuf_t buf = getdqbuf();
+       loff_t ret = 0;
+       int i;
+-      struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
++      union v2_disk_dqblk *ddquot = GETENTRIES(buf);
++      int type = dquot->dq_type;
++      uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision;
++      uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev);
+ 
+       if (!buf)
+               return -ENOMEM;
+-      if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
++
++      ret = read_blk(dquot->dq_sb, type, blk, buf);
++      if (ret < 0) {
+               printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+               goto out_buf;
+       }
+       if (dquot->dq_id)
+-              for (i = 0; i < V2_DQSTRINBLK &&
+-                   le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
++              for (i = 0; i < dqstrinblk && dqid(ddquot, rev) != dquot->dq_id;
++                   i++, ddquot = (char *)ddquot + dqblksz);
+       else {  /* ID 0 as a bit more complicated searching... */
+-              struct v2_disk_dqblk fakedquot;
+-
+-              memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
+-              for (i = 0; i < V2_DQSTRINBLK; i++)
+-                      if (!le32_to_cpu(ddquot[i].dqb_id) &&
+-                          memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
++              for (i = 0; i < dqstrinblk; i++, ddquot = (char *)ddquot+dqblksz)
++                      if (!dqid(ddquot, rev) &&
++                          memcmp(&emptydquot, ddquot, dqblksz))
+                               break;
+       }
+-      if (i == V2_DQSTRINBLK) {
++      if (i == dqstrinblk) {
+               printk(KERN_ERR "VFS: Quota for id %u referenced "
+                 "but not present.\n", dquot->dq_id);
+               ret = -EIO;
+               goto out_buf;
+       }
+       else
+-              ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
+-                v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
++              ret = (blk << V2_DQBLKSIZE_BITS)+((char *)ddquot-(char *)buf);
+ out_buf:
+       freedqbuf(buf);
+       return ret;
+@@ -605,7 +716,7 @@ static int v2_read_dquot(struct dquot *d
+ {
+       int type = dquot->dq_type;
+       loff_t offset;
+-      struct v2_disk_dqblk ddquot, empty;
++      union v2_disk_dqblk ddquot;
+       int ret = 0;
+ 
+ #ifdef __QUOTA_V2_PARANOIA
+@@ -626,25 +737,30 @@ static int v2_read_dquot(struct dquot *d
+               ret = offset;
+       }
+       else {
++              uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.
++                         dqi_revision;
++              uint  dqblksz = v2_dqblksz(rev);
+               dquot->dq_off = offset;
+-              if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
+-                  (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
+-                  != sizeof(struct v2_disk_dqblk)) {
++              ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
++                                         (char *)&ddquot, dqblksz, offset);
++              if (ret != dqblksz) {
+                       if (ret >= 0)
+                               ret = -EIO;
+                       printk(KERN_ERR "VFS: Error while reading quota "
+                         "structure for id %u.\n", dquot->dq_id);
+-                      memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
++                      memset(&ddquot, 0, dqblksz);
+               }
+               else {
+                       ret = 0;
+                       /* We need to escape back all-zero structure */
+-                      memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+-                      empty.dqb_itime = cpu_to_le64(1);
+-                      if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
+-                              ddquot.dqb_itime = 0;
++                      if (!memcmp(&fakedquot[rev], &ddquot, dqblksz)) {
++                              if (rev == 0)
++                                      ddquot.r0.dqb_itime = cpu_to_le64(0);
++                              else
++                                      ddquot.r1.dqb_itime = cpu_to_le64(0);
++                      }
+               }
+-              disk2memdqb(&dquot->dq_dqb, &ddquot);
++              disk2memdqb(&dquot->dq_dqb, &ddquot, rev);
+               if (!dquot->dq_dqb.dqb_bhardlimit &&
+                       !dquot->dq_dqb.dqb_bsoftlimit &&
+                       !dquot->dq_dqb.dqb_ihardlimit &&
+diff -rNpu linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h
+--- linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h     2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h       2008-03-17 23:39:54.000000000 +0300
+@@ -21,6 +21,7 @@ struct v2_mem_dqinfo {
+       unsigned int dqi_blocks;
+       unsigned int dqi_free_blk;
+       unsigned int dqi_free_entry;
++      unsigned int dqi_revision;
+ };
+ 
+ #endif /* _LINUX_DQBLK_V2_H */
+diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quota.h linux-2.6.16.54-0.2.5-quota/include/linux/quota.h
+--- linux-2.6.16.54-0.2.5/include/linux/quota.h        2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/include/linux/quota.h  2008-03-17 23:39:54.000000000 +0300
+@@ -148,12 +148,12 @@ struct if_dqinfo {
+  * Data for one user/group kept in memory
+  */
+ struct mem_dqblk {
+-      __u32 dqb_bhardlimit;   /* absolute limit on disk blks alloc */
+-      __u32 dqb_bsoftlimit;   /* preferred limit on disk blks */
++      qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */
++      qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */
+       qsize_t dqb_curspace;   /* current used space */
+-      __u32 dqb_ihardlimit;   /* absolute limit on allocated inodes */
+-      __u32 dqb_isoftlimit;   /* preferred inode limit */
+-      __u32 dqb_curinodes;    /* current # allocated inodes */
++      qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */
++      qsize_t dqb_isoftlimit; /* preferred inode limit */
++      qsize_t dqb_curinodes;  /* current # allocated inodes */
+       time_t dqb_btime;       /* time limit for excessive disk use */
+       time_t dqb_itime;       /* time limit for excessive inode use */
+ };
+@@ -169,6 +169,8 @@ struct mem_dqinfo {
+       unsigned long dqi_flags;
+       unsigned int dqi_bgrace;
+       unsigned int dqi_igrace;
++      qsize_t dqi_maxblimit;
++      qsize_t dqi_maxilimit;
+       union {
+               struct v1_mem_dqinfo v1_i;
+               struct v2_mem_dqinfo v2_i;
+diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h
+--- linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h   2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h     2008-03-17 23:39:54.000000000 +0300
+@@ -16,28 +16,51 @@
+       0xd9c01927      /* GRPQUOTA */\
+ }
+ 
+-#define V2_INITQVERSIONS {\
++#define V2_INITQVERSIONS_R0 {\
+       0,              /* USRQUOTA */\
+       0               /* GRPQUOTA */\
+ }
+ 
++#define V2_INITQVERSIONS_R1 {\
++      1,              /* USRQUOTA */\
++      1               /* GRPQUOTA */\
++}
++
+ /*
+  * The following structure defines the format of the disk quota file
+  * (as it appears on disk) - the file is a radix tree whose leaves point
+  * to blocks of these structures.
+  */
+-struct v2_disk_dqblk {
++struct v2_disk_dqblk_r0 {
+       __le32 dqb_id;          /* id this quota applies to */
+       __le32 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+       __le32 dqb_isoftlimit;  /* preferred inode limit */
+       __le32 dqb_curinodes;   /* current # allocated inodes */
+-      __le32 dqb_bhardlimit;  /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+-      __le32 dqb_bsoftlimit;  /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
++      __le32 dqb_bhardlimit;  /* absolute limit on disk space */
++      __le32 dqb_bsoftlimit;  /* preferred limit on disk space */
++      __le64 dqb_curspace;    /* current space occupied (in bytes) */
++      __le64 dqb_btime;       /* time limit for excessive disk use */
++      __le64 dqb_itime;       /* time limit for excessive inode use */
++};
++
++struct v2_disk_dqblk_r1 {
++      __le32 dqb_id;          /* id this quota applies to */
++      __le32 dqb_padding;     /* padding field */
++      __le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
++      __le64 dqb_isoftlimit;  /* preferred inode limit */
++      __le64 dqb_curinodes;   /* current # allocated inodes */
++      __le64 dqb_bhardlimit;  /* absolute limit on disk space */
++      __le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+       __le64 dqb_curspace;    /* current space occupied (in bytes) */
+       __le64 dqb_btime;       /* time limit for excessive disk use */
+       __le64 dqb_itime;       /* time limit for excessive inode use */
+ };
+ 
++union v2_disk_dqblk {
++      struct v2_disk_dqblk_r0 r0;
++      struct v2_disk_dqblk_r1 r1;
++};
++
+ /*
+  * Here are header structures as written on disk and their in-memory copies
+  */
+@@ -59,7 +82,7 @@ struct v2_disk_dqinfo {
+ 
+ /*
+  *  Structure of header of block with quota structures. It is padded to 16 bytes so
+- *  there will be space for exactly 21 quota-entries in a block
++ *  there will be space for exactly 21 (r0) or 14 (r1) quota-entries in a block
+  */
+ struct v2_disk_dqdbheader {
+       __le32 dqdh_next_free;  /* Number of next block with free entry */
+@@ -74,6 +97,5 @@ struct v2_disk_dqdbheader {
+ #define V2_DQBLKSIZE  (1 << V2_DQBLKSIZE_BITS)        /* Size of block with quota structures */
+ #define V2_DQTREEOFF  1               /* Offset of tree in file in blocks */
+ #define V2_DQTREEDEPTH        4               /* Depth of quota tree */
+-#define V2_DQSTRINBLK ((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk))     /* Number of entries in one blocks */
+ 
+ #endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch

index 735af2c..decf7a4 100644 (file)
--- a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch
+++ b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch
@@ -1,6 +1,7 @@
-diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
---- linux-2.6.18-53.orig/drivers/md/raid5.c    2007-12-28 18:55:24.000000000 +0800
-+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:08:15.000000000 +0800
+Index: linux-2.6.18-92.1.17/drivers/md/raid5.c
+===================================================================
+--- linux-2.6.18-92.1.17.orig/drivers/md/raid5.c
++++ linux-2.6.18-92.1.17/drivers/md/raid5.c
  @@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip
         }
   }
@@ -151,7 +152,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/rai
                 }
                 if (sh) {
  -                      handle_stripe(sh, NULL);
-+                      handle_stripe(sh, NULL, NULL);
++                      handle_stripe(sh, NULL, bios);
                         release_stripe(sh);
                         sh = NULL;
                 }
diff --git a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch

index fa92977..dd80825 100644 (file)
--- a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch
+++ b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch
@@ -345,9 +345,9 @@ diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/includ
   #define PG_nosave_free                18      /* Free, should not be written */
   #define PG_buddy              19      /* Page is free, on buddy lists */
  +#define PG_constant           20      /* To mark if the page is constant */
+ #define PG_xpmem              27      /* Testing for xpmem. */
   
   /* PG_owner_priv_1 users should have descriptive aliases */
- #define PG_checked              PG_owner_priv_1 /* Used by some filesystems */
  @@ -252,6 +253,14 @@
   
   struct page;  /* forward declaration */
diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch

index 33160d9..954c445 100644 (file)
--- a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch
+++ b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch
@@ -1,9 +1,7 @@
-Index: linux-2.6.9-5.0.3.EL/drivers/scsi/Kconfig\r
-===================================================================\r
-Index: linux-2.6.9/drivers/scsi/Kconfig
+Index: linux-2.6.9-67.0.20/drivers/scsi/Kconfig
  ===================================================================
---- linux-2.6.9.orig/drivers/scsi/Kconfig      2007-07-23 14:19:13.000000000 +0400
-+++ linux-2.6.9/drivers/scsi/Kconfig   2007-07-26 14:16:36.000000000 +0400
+--- linux-2.6.9-67.0.20.orig/drivers/scsi/Kconfig
++++ linux-2.6.9-67.0.20/drivers/scsi/Kconfig
  @@ -61,6 +61,14 @@ config SCSI_DUMP
         help
            SCSI dump support
@@ -19,10 +17,10 @@ Index: linux-2.6.9/drivers/scsi/Kconfig
   config CHR_DEV_ST
         tristate "SCSI tape support"
         depends on SCSI
-Index: linux-2.6.9/drivers/scsi/scsi_proc.c
+Index: linux-2.6.9-67.0.20/drivers/scsi/scsi_proc.c
  ===================================================================
---- linux-2.6.9.orig/drivers/scsi/scsi_proc.c  2007-03-13 02:47:28.000000000 +0300
-+++ linux-2.6.9/drivers/scsi/scsi_proc.c       2007-07-26 14:16:36.000000000 +0400
+--- linux-2.6.9-67.0.20.orig/drivers/scsi/scsi_proc.c
++++ linux-2.6.9-67.0.20/drivers/scsi/scsi_proc.c
  @@ -38,7 +38,8 @@
   /* 4K page size, but our output routines, use some slack for overruns */
   #define PROC_BLOCK_SIZE (3*1024)
@@ -33,11 +31,11 @@ Index: linux-2.6.9/drivers/scsi/scsi_proc.c
   
   /* Protect sht->present and sht->proc_dir */
   static DECLARE_MUTEX(global_host_template_sem);
-Index: linux-2.6.9/drivers/scsi/sd.c
+Index: linux-2.6.9-67.0.20/drivers/scsi/sd.c
  ===================================================================
---- linux-2.6.9.orig/drivers/scsi/sd.c 2007-03-13 02:47:27.000000000 +0300
-+++ linux-2.6.9/drivers/scsi/sd.c      2007-07-28 14:55:56.000000000 +0400
-@@ -63,6 +63,67 @@
+--- linux-2.6.9-67.0.20.orig/drivers/scsi/sd.c
++++ linux-2.6.9-67.0.20/drivers/scsi/sd.c
+@@ -63,6 +63,63 @@
   
   #include "scsi_logging.h"
   
@@ -46,15 +44,15 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +# include <linux/seq_file.h>
  +
  +typedef struct {
-+        unsigned long long iostat_size;
-+        unsigned long long iostat_count;
++      unsigned long long iostat_size;
++      unsigned long long iostat_count;
  +} iostat_counter_t;
  +
  +#define IOSTAT_NCOUNTERS 16
  +typedef struct {
-+        iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];
-+        iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS];
-+        struct timeval          iostat_timeval;
++      iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];
++      iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS];
++      struct timeval          iostat_timeval;
  +
  +      /* queue depth: how well the pipe is filled up */
  +      unsigned long long      iostat_queue_ticks[IOSTAT_NCOUNTERS];
@@ -79,24 +77,20 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      unsigned long           iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
  +      unsigned long           iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
  +
-+      char                    iostat_name[32];
-+
  +      /* must be the last field, as it's used to know size to be memset'ed */
-+      spinlock_t              iostat_lock;
-+}  ____cacheline_aligned_in_smp iostat_stats_t;
++      spinlock_t              iostat_lock;
++} ____cacheline_aligned_in_smp iostat_stats_t;
  +
-+iostat_stats_t       **sd_iostats;
-+struct proc_dir_entry *sd_iostats_procdir;
-+char                   sd_iostats_procdir_name[] = "sd_iostats";
++struct proc_dir_entry *sd_iostats_procdir = NULL;
++char sd_iostats_procdir_name[] = "sd_iostats";
++static struct file_operations sd_iostats_proc_fops;
  +
  +extern void sd_iostats_init(void);
-+extern void sd_iostats_init_disk(struct gendisk *);
  +extern void sd_iostats_fini(void);
  +void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
  +void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
  +#else
  +static inline void sd_iostats_init(void) {}
-+static inline void sd_iostats_init_disk(struct gendisk *disk) {}
  +static inline void sd_iostats_fini(void) {}
  +static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
  +static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
@@ -105,32 +99,26 @@ Index: linux-2.6.9/drivers/scsi/sd.c
   /*
    * More than enough for everybody ;)  The huge number of majors
    * is a leftover from 16bit dev_t days, we don't really need that
-@@ -76,6 +137,7 @@
-  */
- #define SD_MAX_DISKS  (((26 * 26) + 26 + 1) * 26)
+@@ -101,6 +158,9 @@ struct scsi_disk {
+       u8              write_prot;
+       unsigned        WCE : 1;        /* state of disk WCE bit */
+       unsigned        RCD : 1;        /* state of disk RCD bit, unused */
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      iostat_stats_t  *stats;         /* scsi disk statistics */
++#endif
+ };
   
-+#define SD_STATS 256
- /*
-  * Time out in seconds for disks and Magneto-opticals (which are slower).
-  */
-@@ -278,6 +340,8 @@ static int sd_init_command(struct scsi_c
-       SCSI_LOG_HLQUEUE(2, printk("%s : block=%llu\n",
-                                  disk->disk_name, (unsigned long long)block));
+ static DEFINE_IDR(sd_index_idr);
+@@ -391,6 +451,8 @@ queue:
+       SCpnt->allowed = SD_MAX_RETRIES;
+       SCpnt->timeout_per_command = timeout;
   
  +      sd_iostats_start_req(SCpnt);
  +
         /*
-        * If we have a 1K hardware sectorsize, prevent access to single
-        * 512 byte sectors.  In theory we could handle this - in fact
-@@ -474,6 +538,7 @@ static int sd_open(struct inode *inode, 
-                       scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);
-       }
- 
-+   sd_iostats_init_disk(disk);
-       return 0;
- 
- error_out:
-@@ -849,6 +914,9 @@ static void sd_rw_intr(struct scsi_cmnd 
+        * This is the completion routine we use.  This is matched in terms
+        * of capability to this function.
+@@ -849,6 +911,9 @@ static void sd_rw_intr(struct scsi_cmnd 
                         break;
                 }
         }
@@ -140,7 +128,60 @@ Index: linux-2.6.9/drivers/scsi/sd.c
         /*
          * This calls the generic completion function, now that we know
          * how many actual sectors finished, and how many sectors we need
-@@ -1575,6 +1643,481 @@ static void sd_shutdown(struct device *d
+@@ -1487,6 +1552,36 @@ static int sd_probe(struct device *dev)
+               gd->flags |= GENHD_FL_REMOVABLE;
+       gd->queue = sdkp->device->request_queue;
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL);
++      if (!sdkp->stats) {
++              printk(KERN_WARNING "cannot allocate iostat structure for"
++                                  "%s\n", gd->disk_name);
++      } else {
++              do_gettimeofday(&sdkp->stats->iostat_timeval);
++              sdkp->stats->iostat_queue_stamp = jiffies;
++              spin_lock_init(&sdkp->stats->iostat_lock);
++              if (sd_iostats_procdir) {
++                      struct proc_dir_entry *pde;
++                      pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR,
++                                              sd_iostats_procdir);
++                      if (!pde) {
++                              printk(KERN_WARNING "Can't create /proc/scsi/"
++                                                  "%s/%s\n",
++                                                  sd_iostats_procdir_name,
++                                                  gd->disk_name);
++                              kfree(sdkp->stats);
++                              sdkp->stats = NULL;
++                      } else {
++                              pde->proc_fops = &sd_iostats_proc_fops;
++                              pde->data = gd;
++                      }
++              } else {
++                      kfree(sdkp->stats);
++                      sdkp->stats = NULL;
++              }
++      }
++#endif
+       dev_set_drvdata(dev, sdkp);
+       add_disk(gd);
+ 
+@@ -1549,8 +1644,14 @@ static void scsi_disk_release(struct kre
+ 
+       disk->private_data = NULL;
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      if (sdkp->stats) {
++              remove_proc_entry(disk->disk_name, sd_iostats_procdir);
++              kfree(sdkp->stats);
++              sdkp->stats = NULL;
++      }
++#endif
+       put_disk(disk);
+-
+       kfree(sdkp);
+ }
+ 
+@@ -1575,6 +1676,366 @@ static void sd_shutdown(struct device *d
         sd_sync_cache(sdp);
   }     
   
@@ -162,12 +203,7 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      int                i;
  +      int                maxi;
  +
-+      if (sd_iostats == NULL) {
-+              printk(KERN_ERR "sd_iostats_seq_show: NULL stats array\n");
-+              BUG();
-+      }
-+
-+      stats = sd_iostats[scsi_disk(disk)->index];
++      stats = scsi_disk(disk)->stats;
  +      if (stats == NULL) {
  +              printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
  +              BUG();
@@ -314,7 +350,7 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +static int
  +sd_iostats_seq_open (struct inode *inode, struct file *file)
  +{
-+      int                    rc;
++      int rc;
  +
  +      rc = seq_open(file, &sd_iostats_seqops);
  +      if (rc != 0)
@@ -326,11 +362,11 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +
  +static ssize_t
  +sd_iostats_seq_write(struct file *file, const char *buffer,
-+                     size_t len, loff_t *off)
++                   size_t len, loff_t *off)
  +{
  +      struct seq_file   *seq = file->private_data;
  +      struct gendisk *disk = seq->private;
-+      iostat_stats_t    *stats = sd_iostats[scsi_disk(disk)->index];
++      iostat_stats_t    *stats = scsi_disk(disk)->stats;
  +      unsigned long      flags;
  +      unsigned long      qdepth;
  +
@@ -360,19 +396,6 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +void
  +sd_iostats_init(void)
  +{
-+      int    i;
-+
-+      sd_iostats = kmalloc(SD_STATS * sizeof(iostat_stats_t *), GFP_KERNEL);
-+      if (sd_iostats == NULL) {
-+              printk(KERN_WARNING "Can't keep sd iostats: "
-+                      "ENOMEM allocating stats array size %d\n",
-+                      SD_STATS * sizeof(iostat_stats_t *));
-+              return;
-+      }
-+
-+      for (i = 0; i < SD_STATS; i++)
-+              sd_iostats[i] = NULL;
-+
  +      if (proc_scsi == NULL) {
  +              printk(KERN_WARNING "No access to sd iostats: "
  +                      "proc_scsi is NULL\n");
@@ -386,91 +409,15 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +              printk(KERN_WARNING "No access to sd iostats: "
  +                      "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
  +              return;
-+        }
-+}
-+
-+void
-+sd_iostats_init_disk(struct gendisk *disk)
-+{
-+      struct proc_dir_entry *pde;
-+      unsigned long          flags;
-+      iostat_stats_t        *stats;
-+
-+      if (sd_iostats == NULL || sd_iostats_procdir == NULL)
-+              return;
-+
-+      if (scsi_disk(disk)->index > SD_STATS) {
-+              printk(KERN_ERR "sd_iostats_init_disk: "
-+                      "unexpected disk index %d(%d)\n",
-+                      scsi_disk(disk)->index, SD_STATS);
-+              return;
-+      }
-+
-+      if (sd_iostats[scsi_disk(disk)->index] != NULL)
-+              return;
-+
-+      stats = kmalloc(sizeof(*stats), GFP_KERNEL);
-+      if (stats == NULL) {
-+              printk(KERN_WARNING "Can't keep %s iostats: "
-+                      "ENOMEM allocating stats size %d\n", 
-+                      disk->disk_name, sizeof(*stats));
-+              return;
-+      }
-+
-+      memset (stats, 0, sizeof(*stats));
-+      do_gettimeofday(&stats->iostat_timeval);
-+      stats->iostat_queue_stamp = jiffies;
-+      spin_lock_init(&stats->iostat_lock);
-+
-+
-+      spin_lock_irqsave(&stats->iostat_lock, flags);
-+
-+      if (sd_iostats[scsi_disk(disk)->index] != NULL) {
-+              spin_unlock_irqrestore(&stats->iostat_lock, flags);
-+              kfree (stats);
-+              return;
-+      }
-+
-+      sd_iostats[scsi_disk(disk)->index] = stats;
-+
-+      spin_unlock_irqrestore(&stats->iostat_lock, flags);
-+
-+      strncpy(stats->iostat_name, disk->disk_name,
-+              sizeof(stats->iostat_name)-1);
-+
-+      pde = create_proc_entry(stats->iostat_name, S_IRUGO | S_IWUSR,
-+                              sd_iostats_procdir);
-+      if (pde == NULL) {
-+              printk(KERN_WARNING "Can't create /proc/scsi/%s/%s\n",
-+                      sd_iostats_procdir_name, disk->disk_name);
-+      } else {
-+              pde->proc_fops = &sd_iostats_proc_fops;
-+              pde->data = disk;
  +      }
  +}
  +
  +void sd_iostats_fini(void)
  +{
-+      int  i;
-+
-+      if (sd_iostats == NULL)
-+              return;
-+
-+      for (i = 0; i < SD_STATS; i++) {
-+              if (sd_iostats[i] == NULL)
-+                      continue;
-+              if (sd_iostats_procdir != NULL)
-+                      remove_proc_entry(sd_iostats[i]->iostat_name,
-+                                              sd_iostats_procdir);
-+              kfree(sd_iostats[i]);
-+      }
-+
  +      if (proc_scsi != NULL && sd_iostats_procdir != NULL)
  +              remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
  +
  +      sd_iostats_procdir = NULL;
-+      kfree(sd_iostats);
-+      sd_iostats = NULL;
  +}
  +
  +void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)
@@ -481,20 +428,9 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      int                     tbucket;
  +      int                     tmp;
  +      unsigned long           irqflags;
-+      int                     disk, i;
-+
-+      disk = scsi_disk(rq->rq_disk)->index;
-+
-+      if (sd_iostats == NULL)
-+              return;
-+
-+      if (disk < 0 || disk >= SD_STATS) {
-+              printk(KERN_ERR "sd_iostats_bump: unexpected disk index "
-+                      "%d([0-%d])\n", disk, SD_STATS);
-+              BUG();
-+      }
++      unsigned long           i;
  +
-+      stats = sd_iostats[disk];
++      stats = scsi_disk(rq->rq_disk)->stats;
  +      if (stats == NULL)
  +              return;
  +
@@ -519,6 +455,7 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +              i = IOSTAT_NCOUNTERS - 1;
  +      stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
  +      stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++      BUG_ON(stats->iostat_queue_depth == 0);
  +      stats->iostat_queue_depth--;
  +
  +      /* update seek stats. XXX: not sure about nr_sectors */
@@ -547,21 +484,10 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      int                     tbucket;
  +      int                     tmp;
  +      unsigned long           irqflags;
-+      int                     disk, i;
++      unsigned long           i;
  +      int                     nsect;
  +
-+      disk = scsi_disk(rq->rq_disk)->index;
-+
-+      if (sd_iostats == NULL)
-+              return;
-+
-+      if (disk < 0 || disk >= SD_STATS) {
-+              printk(KERN_ERR "sd_iostats_bump: unexpected disk index %d([0-%d])\n",
-+                      disk, SD_STATS);
-+              BUG();
-+      }
-+
-+      stats = sd_iostats[disk];
++      stats = scsi_disk(rq->rq_disk)->stats;
  +      if (stats == NULL)
  +              return;
  +
@@ -622,31 +548,32 @@ Index: linux-2.6.9/drivers/scsi/sd.c
   /**
    *    init_sd - entry point for this driver (both when built in or when
    *    a module).
-@@ -1584,6 +2127,7 @@ static void sd_shutdown(struct device *d
+@@ -1584,6 +2045,7 @@ static void sd_shutdown(struct device *d
   static int __init init_sd(void)
   {
         int majors = 0, i;
-+   int rc = 0;
++      int rc = 0;
   
         SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
   
-@@ -1594,7 +2138,10 @@ static int __init init_sd(void)
+@@ -1594,7 +2056,11 @@ static int __init init_sd(void)
         if (!majors)
                 return -ENODEV;
   
  -      return scsi_register_driver(&sd_template.gendrv);
-+   rc = scsi_register_driver(&sd_template.gendrv);
-+   if (rc == 0)
-+      sd_iostats_init();
-+   return rc;
++      sd_iostats_init();
++      rc = scsi_register_driver(&sd_template.gendrv);
++      if (rc)
++              sd_iostats_fini();
++      return rc;
   }
   
   /**
-@@ -1608,6 +2155,7 @@ static void __exit exit_sd(void)
- 
-       SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n"));
- 
-+   sd_iostats_fini();
+@@ -1611,6 +2077,7 @@ static void __exit exit_sd(void)
         scsi_unregister_driver(&sd_template.gendrv);
         for (i = 0; i < SD_MAJORS; i++)
                 unregister_blkdev(sd_major(i), "sd");
++      sd_iostats_fini();
+ }
+ 
+ MODULE_LICENSE("GPL");
diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch

index e38e22a..d0cc6f6 100644 (file)
--- a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch
+++ b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch
@@ -1,12 +1,10 @@
-Index: linux-2.6.9-5.0.3.EL/drivers/scsi/Kconfig\r
-===================================================================\r
-Index: linux-2.6.9/drivers/scsi/Kconfig
+Index: linux-2.6.18-53.1.21/drivers/scsi/Kconfig
  ===================================================================
---- linux-2.6.9.orig/drivers/scsi/Kconfig      2007-07-23 14:19:13.000000000 +0400
-+++ linux-2.6.9/drivers/scsi/Kconfig   2007-07-26 14:16:36.000000000 +0400
-@@ -61,6 +61,14 @@ config SCSI_DUMP
-       help
-          SCSI dump support
+--- linux-2.6.18-53.1.21.orig/drivers/scsi/Kconfig
++++ linux-2.6.18-53.1.21/drivers/scsi/Kconfig
+@@ -66,6 +66,14 @@ config BLK_DEV_SD
+         In this case, do not compile the driver for your SCSI host adapter
+         (below) as a module either.
   
  +config SD_IOSTATS
  +   bool "Enable SCSI disk I/O stats"
@@ -19,11 +17,11 @@ Index: linux-2.6.9/drivers/scsi/Kconfig
   config CHR_DEV_ST
         tristate "SCSI tape support"
         depends on SCSI
-Index: linux-2.6.9/drivers/scsi/scsi_proc.c
+Index: linux-2.6.18-53.1.21/drivers/scsi/scsi_proc.c
  ===================================================================
---- linux-2.6.9.orig/drivers/scsi/scsi_proc.c  2007-03-13 02:47:28.000000000 +0300
-+++ linux-2.6.9/drivers/scsi/scsi_proc.c       2007-07-26 14:16:36.000000000 +0400
-@@ -38,7 +38,8 @@
+--- linux-2.6.18-53.1.21.orig/drivers/scsi/scsi_proc.c
++++ linux-2.6.18-53.1.21/drivers/scsi/scsi_proc.c
+@@ -40,7 +40,8 @@
   /* 4K page size, but our output routines, use some slack for overruns */
   #define PROC_BLOCK_SIZE (3*1024)
   
@@ -32,12 +30,12 @@ Index: linux-2.6.9/drivers/scsi/scsi_proc.c
  +EXPORT_SYMBOL(proc_scsi);
   
   /* Protect sht->present and sht->proc_dir */
- static DECLARE_MUTEX(global_host_template_sem);
-Index: linux-2.6.9/drivers/scsi/sd.c
+ static DEFINE_MUTEX(global_host_template_mutex);
+Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
  ===================================================================
---- linux-2.6.9.orig/drivers/scsi/sd.c 2007-03-13 02:47:27.000000000 +0300
-+++ linux-2.6.9/drivers/scsi/sd.c      2007-07-28 14:55:56.000000000 +0400
-@@ -63,6 +63,67 @@
+--- linux-2.6.18-53.1.21.orig/drivers/scsi/sd.c
++++ linux-2.6.18-53.1.21/drivers/scsi/sd.c
+@@ -62,6 +62,63 @@
   
   #include "scsi_logging.h"
   
@@ -46,15 +44,15 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +# include <linux/seq_file.h>
  +
  +typedef struct {
-+        unsigned long long iostat_size;
-+        unsigned long long iostat_count;
++      unsigned long long iostat_size;
++      unsigned long long iostat_count;
  +} iostat_counter_t;
  +
  +#define IOSTAT_NCOUNTERS 16
  +typedef struct {
-+        iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];
-+        iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS];
-+        struct timeval          iostat_timeval;
++      iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];
++      iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS];
++      struct timeval          iostat_timeval;
  +
  +      /* queue depth: how well the pipe is filled up */
  +      unsigned long long      iostat_queue_ticks[IOSTAT_NCOUNTERS];
@@ -79,24 +77,20 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      unsigned long           iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
  +      unsigned long           iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
  +
-+      char                    iostat_name[32];
-+
  +      /* must be the last field, as it's used to know size to be memset'ed */
-+      spinlock_t              iostat_lock;
-+}  ____cacheline_aligned_in_smp iostat_stats_t;
++      spinlock_t              iostat_lock;
++} ____cacheline_aligned_in_smp iostat_stats_t;
  +
-+iostat_stats_t       **sd_iostats;
-+struct proc_dir_entry *sd_iostats_procdir;
-+char                   sd_iostats_procdir_name[] = "sd_iostats";
++struct proc_dir_entry *sd_iostats_procdir = NULL;
++char sd_iostats_procdir_name[] = "sd_iostats";
++static struct file_operations sd_iostats_proc_fops;
  +
  +extern void sd_iostats_init(void);
-+extern void sd_iostats_init_disk(struct gendisk *);
  +extern void sd_iostats_fini(void);
  +void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
  +void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
  +#else
  +static inline void sd_iostats_init(void) {}
-+static inline void sd_iostats_init_disk(struct gendisk *disk) {}
  +static inline void sd_iostats_fini(void) {}
  +static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
  +static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
@@ -105,42 +99,73 @@ Index: linux-2.6.9/drivers/scsi/sd.c
   /*
    * More than enough for everybody ;)  The huge number of majors
    * is a leftover from 16bit dev_t days, we don't really need that
-@@ -76,6 +137,7 @@
-  */
- #define SD_MAX_DISKS  (((26 * 26) + 26 + 1) * 26)
+@@ -126,6 +183,9 @@ struct scsi_disk {
+       unsigned        WCE : 1;        /* state of disk WCE bit */
+       unsigned        RCD : 1;        /* state of disk RCD bit, unused */
+       unsigned        DPOFUA : 1;     /* state of disk DPOFUA bit */
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      iostat_stats_t  *stats;         /* scsi disk statistics */
++#endif
+ };
+ #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev)
   
-+#define SD_STATS 256
- /*
-  * Time out in seconds for disks and Magneto-opticals (which are slower).
-  */
-@@ -278,6 +340,8 @@ static int sd_init_command(struct scsi_c
-       SCSI_LOG_HLQUEUE(2, printk("%s : block=%llu\n",
-                                  disk->disk_name, (unsigned long long)block));
+@@ -557,6 +617,8 @@ static int sd_init_command(struct scsi_c
+        */
+       SCpnt->done = sd_rw_intr;
   
  +      sd_iostats_start_req(SCpnt);
  +
         /*
-        * If we have a 1K hardware sectorsize, prevent access to single
-        * 512 byte sectors.  In theory we could handle this - in fact
-@@ -474,6 +538,7 @@ static int sd_open(struct inode *inode, 
-                       scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);
-       }
- 
-+   sd_iostats_init_disk(disk);
-       return 0;
- 
- error_out:
-@@ -849,6 +914,7 @@ static void sd_rw_intr(struct scsi_cmnd 
+        * This indicates that the command is ready from our end to be
+        * queued.
+@@ -1040,6 +1102,7 @@ static void sd_rw_intr(struct scsi_cmnd 
                 break;
         }
    out:
  +      sd_iostats_finish_req(SCpnt);
         scsi_io_completion(SCpnt, good_bytes);
   }
-
-@@ -1575,6 +1643,481 @@ static void sd_shutdown(struct device *d
-       sd_sync_cache(sdp);
- }     
+ 
+@@ -1735,6 +1798,36 @@ static int sd_probe(struct device *dev)
+       if (sdp->removable)
+               gd->flags |= GENHD_FL_REMOVABLE;
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL);
++      if (!sdkp->stats) {
++              printk(KERN_WARNING "cannot allocate iostat structure for"
++                                  "%s\n", gd->disk_name);
++      } else {
++              do_gettimeofday(&sdkp->stats->iostat_timeval);
++              sdkp->stats->iostat_queue_stamp = jiffies;
++              spin_lock_init(&sdkp->stats->iostat_lock);
++              if (sd_iostats_procdir) {
++                      struct proc_dir_entry *pde;
++                      pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR,
++                                              sd_iostats_procdir);
++                      if (!pde) {
++                              printk(KERN_WARNING "Can't create /proc/scsi/"
++                                                  "%s/%s\n",
++                                                  sd_iostats_procdir_name,
++                                                  gd->disk_name);
++                              kfree(sdkp->stats);
++                              sdkp->stats = NULL;
++                      } else {
++                              pde->proc_fops = &sd_iostats_proc_fops;
++                              pde->data = gd;
++                      }
++              } else {
++                      kfree(sdkp->stats);
++                      sdkp->stats = NULL;
++              }
++      }
++#endif
+       dev_set_drvdata(dev, sdkp);
+       add_disk(gd);
+ 
+@@ -1778,6 +1871,366 @@ static int sd_remove(struct device *dev)
+       return 0;
+ }
   
  +#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
  +static int
@@ -160,12 +185,7 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      int                i;
  +      int                maxi;
  +
-+      if (sd_iostats == NULL) {
-+              printk(KERN_ERR "sd_iostats_seq_show: NULL stats array\n");
-+              BUG();
-+      }
-+
-+      stats = sd_iostats[scsi_disk(disk)->index];
++      stats = scsi_disk(disk)->stats;
  +      if (stats == NULL) {
  +              printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
  +              BUG();
@@ -312,7 +332,7 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +static int
  +sd_iostats_seq_open (struct inode *inode, struct file *file)
  +{
-+      int                    rc;
++      int rc;
  +
  +      rc = seq_open(file, &sd_iostats_seqops);
  +      if (rc != 0)
@@ -324,11 +344,11 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +
  +static ssize_t
  +sd_iostats_seq_write(struct file *file, const char *buffer,
-+                     size_t len, loff_t *off)
++                   size_t len, loff_t *off)
  +{
  +      struct seq_file   *seq = file->private_data;
  +      struct gendisk *disk = seq->private;
-+      iostat_stats_t    *stats = sd_iostats[scsi_disk(disk)->index];
++      iostat_stats_t    *stats = scsi_disk(disk)->stats;
  +      unsigned long      flags;
  +      unsigned long      qdepth;
  +
@@ -358,19 +378,6 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +void
  +sd_iostats_init(void)
  +{
-+      int    i;
-+
-+      sd_iostats = kmalloc(SD_STATS * sizeof(iostat_stats_t *), GFP_KERNEL);
-+      if (sd_iostats == NULL) {
-+              printk(KERN_WARNING "Can't keep sd iostats: "
-+                      "ENOMEM allocating stats array size %d\n",
-+                      SD_STATS * sizeof(iostat_stats_t *));
-+              return;
-+      }
-+
-+      for (i = 0; i < SD_STATS; i++)
-+              sd_iostats[i] = NULL;
-+
  +      if (proc_scsi == NULL) {
  +              printk(KERN_WARNING "No access to sd iostats: "
  +                      "proc_scsi is NULL\n");
@@ -378,97 +385,21 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      }
  +
  +      sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
-+                      S_IFDIR | S_IRUGO | S_IXUGO,
-+                      proc_scsi);
++                                             S_IFDIR | S_IRUGO | S_IXUGO,
++                                              proc_scsi);
  +      if (sd_iostats_procdir == NULL) {
  +              printk(KERN_WARNING "No access to sd iostats: "
  +                      "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
  +              return;
-+        }
-+}
-+
-+void
-+sd_iostats_init_disk(struct gendisk *disk)
-+{
-+      struct proc_dir_entry *pde;
-+      unsigned long          flags;
-+      iostat_stats_t        *stats;
-+
-+      if (sd_iostats == NULL || sd_iostats_procdir == NULL)
-+              return;
-+
-+      if (scsi_disk(disk)->index > SD_STATS) {
-+              printk(KERN_ERR "sd_iostats_init_disk: "
-+                      "unexpected disk index %d(%d)\n",
-+                      scsi_disk(disk)->index, SD_STATS);
-+              return;
-+      }
-+
-+      if (sd_iostats[scsi_disk(disk)->index] != NULL)
-+              return;
-+
-+      stats = kmalloc(sizeof(*stats), GFP_KERNEL);
-+      if (stats == NULL) {
-+              printk(KERN_WARNING "Can't keep %s iostats: "
-+                      "ENOMEM allocating stats size %d\n", 
-+                      disk->disk_name, sizeof(*stats));
-+              return;
-+      }
-+
-+      memset (stats, 0, sizeof(*stats));
-+      do_gettimeofday(&stats->iostat_timeval);
-+      stats->iostat_queue_stamp = jiffies;
-+      spin_lock_init(&stats->iostat_lock);
-+
-+
-+      spin_lock_irqsave(&stats->iostat_lock, flags);
-+
-+      if (sd_iostats[scsi_disk(disk)->index] != NULL) {
-+              spin_unlock_irqrestore(&stats->iostat_lock, flags);
-+              kfree (stats);
-+              return;
-+      }
-+
-+      sd_iostats[scsi_disk(disk)->index] = stats;
-+
-+      spin_unlock_irqrestore(&stats->iostat_lock, flags);
-+
-+      strncpy(stats->iostat_name, disk->disk_name,
-+              sizeof(stats->iostat_name)-1);
-+
-+      pde = create_proc_entry(stats->iostat_name, S_IRUGO | S_IWUSR,
-+                              sd_iostats_procdir);
-+      if (pde == NULL) {
-+              printk(KERN_WARNING "Can't create /proc/scsi/%s/%s\n",
-+                      sd_iostats_procdir_name, disk->disk_name);
-+      } else {
-+              pde->proc_fops = &sd_iostats_proc_fops;
-+              pde->data = disk;
  +      }
  +}
  +
  +void sd_iostats_fini(void)
  +{
-+      int  i;
-+
-+      if (sd_iostats == NULL)
-+              return;
-+
-+      for (i = 0; i < SD_STATS; i++) {
-+              if (sd_iostats[i] == NULL)
-+                      continue;
-+              if (sd_iostats_procdir != NULL)
-+                      remove_proc_entry(sd_iostats[i]->iostat_name,
-+                                              sd_iostats_procdir);
-+              kfree(sd_iostats[i]);
-+      }
-+
  +      if (proc_scsi != NULL && sd_iostats_procdir != NULL)
  +              remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
  +
  +      sd_iostats_procdir = NULL;
-+      kfree(sd_iostats);
-+      sd_iostats = NULL;
  +}
  +
  +void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)
@@ -479,31 +410,20 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      int                     tbucket;
  +      int                     tmp;
  +      unsigned long           irqflags;
-+      int                     disk, i;
-+
-+      disk = scsi_disk(rq->rq_disk)->index;
-+
-+      if (sd_iostats == NULL)
-+              return;
++      unsigned long           i;
  +
-+      if (disk < 0 || disk >= SD_STATS) {
-+              printk(KERN_ERR "sd_iostats_bump: unexpected disk index "
-+                      "%d([0-%d])\n", disk, SD_STATS);
-+              BUG();
-+      }
-+
-+      stats = sd_iostats[disk];
++      stats = scsi_disk(rq->rq_disk)->stats;
  +      if (stats == NULL)
  +              return;
  +
-+      tmp = jiffies -  rq->start_time;
++      tmp = jiffies - rq->start_time;
  +      for (tbucket = 0; tmp > 1; tbucket++)
  +              tmp >>= 1;
  +      if (tbucket >= IOSTAT_NCOUNTERS)
  +              tbucket = IOSTAT_NCOUNTERS - 1;
  +      //printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket);
  +
-+      tcounter = rq_data_dir(rq) == WRITE ? 
++      tcounter = rq_data_dir(rq) == WRITE ?
  +              &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket];
  +
  +      spin_lock_irqsave(&stats->iostat_lock, irqflags);
@@ -517,13 +437,14 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +              i = IOSTAT_NCOUNTERS - 1;
  +      stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
  +      stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++      BUG_ON(stats->iostat_queue_depth == 0);
  +      stats->iostat_queue_depth--;
  +
  +      /* update seek stats. XXX: not sure about nr_sectors */
  +      stats->iostat_sectors += rq->nr_sectors;
  +      stats->iostat_reqs++;
  +      if (rq->sector != stats->iostat_next_sector) {
-+              stats->iostat_seek_sectors += 
++              stats->iostat_seek_sectors +=
  +                      rq->sector > stats->iostat_next_sector ?
  +                      rq->sector - stats->iostat_next_sector :
  +                      stats->iostat_next_sector - rq->sector;
@@ -545,21 +466,10 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +      int                     tbucket;
  +      int                     tmp;
  +      unsigned long           irqflags;
-+      int                     disk, i;
++      unsigned long           i;
  +      int                     nsect;
  +
-+      disk = scsi_disk(rq->rq_disk)->index;
-+
-+      if (sd_iostats == NULL)
-+              return;
-+
-+      if (disk < 0 || disk >= SD_STATS) {
-+              printk(KERN_ERR "sd_iostats_bump: unexpected disk index %d([0-%d])\n",
-+                      disk, SD_STATS);
-+              BUG();
-+      }
-+
-+      stats = sd_iostats[disk];
++      stats = scsi_disk(rq->rq_disk)->stats;
  +      if (stats == NULL)
  +              return;
  +
@@ -572,7 +482,7 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +              BUG();
  +      }
  +
-+      counter = rq_data_dir(rq) == WRITE ? 
++      counter = rq_data_dir(rq) == WRITE ?
  +              &stats->iostat_write_histogram[bucket] :
  +              &stats->iostat_read_histogram[bucket];
  +
@@ -618,33 +528,54 @@ Index: linux-2.6.9/drivers/scsi/sd.c
  +#endif
  +
   /**
-  *    init_sd - entry point for this driver (both when built in or when
-  *    a module).
-@@ -1584,6 +2127,7 @@ static void sd_shutdown(struct device *d
+  *    scsi_disk_release - Called to free the scsi_disk structure
+  *    @cdev: pointer to embedded class device
+@@ -1796,10 +2249,16 @@ static void scsi_disk_release(struct cla
+       idr_remove(&sd_index_idr, sdkp->index);
+       spin_unlock(&sd_index_lock);
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      if (sdkp->stats) {
++              remove_proc_entry(disk->disk_name, sd_iostats_procdir);
++              kfree(sdkp->stats);
++              sdkp->stats = NULL;
++      }
++#endif
+       disk->private_data = NULL;
+       put_disk(disk);
+       put_device(&sdkp->device->sdev_gendev);
+-
+       kfree(sdkp);
+ }
+ 
+@@ -1907,6 +2366,7 @@ done:
   static int __init init_sd(void)
   {
         int majors = 0, i;
-+   int rc = 0;
++      int rc = 0;
   
         SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
   
-@@ -1594,7 +2138,10 @@ static int __init init_sd(void)
+@@ -1917,9 +2377,13 @@ static int __init init_sd(void)
         if (!majors)
                 return -ENODEV;
   
++      sd_iostats_init();
+       class_register(&sd_disk_class);
+ 
  -      return scsi_register_driver(&sd_template.gendrv);
-+   rc = scsi_register_driver(&sd_template.gendrv);
-+   if (rc == 0)
-+      sd_iostats_init();
-+   return rc;
++      rc = scsi_register_driver(&sd_template.gendrv);
++      if (rc)
++              sd_iostats_fini();
++      return rc;
   }
   
   /**
-@@ -1608,6 +2155,7 @@ static void __exit exit_sd(void)
+@@ -1938,6 +2402,7 @@ static void __exit exit_sd(void)
+               unregister_blkdev(sd_major(i), "sd");
   
-       SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n"));
+       class_unregister(&sd_disk_class);
++      sd_iostats_fini();
+ }
   
-+   sd_iostats_fini();
-       scsi_unregister_driver(&sd_template.gendrv);
-       for (i = 0; i < SD_MAJORS; i++)
-               unregister_blkdev(sd_major(i), "sd");
+ module_init(init_sd);
diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch b/lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch

new file mode 100644 (file)

index 0000000..6d00acd
--- /dev/null
+++ b/lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch
@@ -0,0 +1,579 @@
+Index: linux-2.6.5-7.311/drivers/scsi/Kconfig
+===================================================================
+--- linux-2.6.5-7.311.orig/drivers/scsi/Kconfig
++++ linux-2.6.5-7.311/drivers/scsi/Kconfig
+@@ -67,6 +67,14 @@ config SCSI_DUMP
+          polling I/O.  If it doesn't, LKCD will fall back to ordinary
+          interrupt-driven I/O.
+ 
++config SD_IOSTATS
++   bool "Enable SCSI disk I/O stats"
++   depends on BLK_DEV_SD
++   default y
++   ---help---
++     This enables SCSI disk I/O stats collection.  You must also enable
++     /proc file system support if you want this feature.
++
+ config CHR_DEV_ST
+       tristate "SCSI tape support"
+       depends on SCSI
+Index: linux-2.6.5-7.311/drivers/scsi/scsi_proc.c
+===================================================================
+--- linux-2.6.5-7.311.orig/drivers/scsi/scsi_proc.c
++++ linux-2.6.5-7.311/drivers/scsi/scsi_proc.c
+@@ -38,7 +38,8 @@
+ /* 4K page size, but our output routines, use some slack for overruns */
+ #define PROC_BLOCK_SIZE (3*1024)
+ 
+-static struct proc_dir_entry *proc_scsi;
++struct proc_dir_entry *proc_scsi;
++EXPORT_SYMBOL(proc_scsi);
+ 
+ /* Protect sht->present and sht->proc_dir */
+ static DECLARE_MUTEX(global_host_template_sem);
+Index: linux-2.6.5-7.311/drivers/scsi/sd.c
+===================================================================
+--- linux-2.6.5-7.311.orig/drivers/scsi/sd.c
++++ linux-2.6.5-7.311/drivers/scsi/sd.c
+@@ -66,6 +66,63 @@
+ #define SD_MAJORS     16
+ #define SD_DISKS      32768   /* anything between 256 and 262144 */
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++# include <linux/proc_fs.h>
++# include <linux/seq_file.h>
++
++typedef struct {
++      unsigned long long iostat_size;
++      unsigned long long iostat_count;
++} iostat_counter_t;
++
++#define IOSTAT_NCOUNTERS 16
++typedef struct {
++      iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];
++      iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS];
++      struct timeval          iostat_timeval;
++
++      /* queue depth: how well the pipe is filled up */
++      unsigned long long      iostat_queue_ticks[IOSTAT_NCOUNTERS];
++      unsigned long long      iostat_queue_ticks_sum;
++      unsigned long           iostat_queue_depth;
++      unsigned long           iostat_queue_stamp;
++
++      /* seeks: how linear the traffic is */
++      unsigned long long      iostat_next_sector;
++      unsigned long long      iostat_seek_sectors;
++      unsigned long long      iostat_seeks;
++      unsigned long long      iostat_sectors;
++      unsigned long long      iostat_reqs;
++      unsigned long           iostat_read_reqs;
++      unsigned long           iostat_write_reqs;
++
++      /* process time: how long it takes to process requests */
++      unsigned long           iostat_rtime[IOSTAT_NCOUNTERS];
++      unsigned long           iostat_wtime[IOSTAT_NCOUNTERS];
++
++      /* queue time: how long process spent in elevator's queue */
++      unsigned long           iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
++      unsigned long           iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
++
++      /* must be the last field, as it's used to know size to be memset'ed */
++      spinlock_t              iostat_lock;
++} ____cacheline_aligned_in_smp iostat_stats_t;
++
++struct proc_dir_entry *sd_iostats_procdir = NULL;
++char sd_iostats_procdir_name[] = "sd_iostats";
++static struct file_operations sd_iostats_proc_fops;
++
++extern void sd_iostats_init(void);
++extern void sd_iostats_fini(void);
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
++#else
++static inline void sd_iostats_init(void) {}
++static inline void sd_iostats_fini(void) {}
++static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
++static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
++#endif
++
+ /*
+  * Time out in seconds for disks and Magneto-opticals (which are slower).
+  */
+@@ -96,6 +153,9 @@ struct scsi_disk {
+       u8              device_ready;
+       unsigned        WCE : 1;        /* state of disk WCE bit */
+       unsigned        RCD : 1;        /* state of disk RCD bit, unused */
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      iostat_stats_t  *stats;         /* scsi disk statistics */
++#endif
+ };
+ 
+ 
+@@ -384,6 +444,8 @@ queue:
+       SCpnt->allowed = SD_MAX_RETRIES;
+       SCpnt->timeout_per_command = timeout;
+ 
++      sd_iostats_start_req(SCpnt);
++
+       /*
+        * This is the completion routine we use.  This is matched in terms
+        * of capability to this function.
+@@ -884,6 +946,9 @@ static void sd_rw_intr(struct scsi_cmnd 
+                       break;
+               }
+       }
++
++      sd_iostats_finish_req(SCpnt);
++
+       /*
+        * This calls the generic completion function, now that we know
+        * how many actual sectors finished, and how many sectors we need
+@@ -1527,6 +1592,36 @@ static int sd_probe(struct device *dev)
+       if (!sdkp->device_ready || sdp->host->no_partition_check)
+               gd->flags |= GENHD_FL_NO_PARTITION_CHECK;
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL);
++      if (!sdkp->stats) {
++              printk(KERN_WARNING "cannot allocate iostat structure for"
++                                  "%s\n", gd->disk_name);
++      } else {
++              do_gettimeofday(&sdkp->stats->iostat_timeval);
++              sdkp->stats->iostat_queue_stamp = jiffies;
++              spin_lock_init(&sdkp->stats->iostat_lock);
++              if (sd_iostats_procdir) {
++                      struct proc_dir_entry *pde;
++                      pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR,
++                                              sd_iostats_procdir);
++                      if (!pde) {
++                              printk(KERN_WARNING "Can't create /proc/scsi/"
++                                                  "%s/%s\n",
++                                                  sd_iostats_procdir_name,
++                                                  gd->disk_name);
++                              kfree(sdkp->stats);
++                              sdkp->stats = NULL;
++                      } else {
++                              pde->proc_fops = &sd_iostats_proc_fops;
++                              pde->data = gd;
++                      }
++              } else {
++                      kfree(sdkp->stats);
++                      sdkp->stats = NULL;
++              }
++      }
++#endif
+       dev_set_drvdata(dev, sdkp);
+       add_disk(gd);
+ 
+@@ -1574,7 +1669,14 @@ static int sd_remove(struct device *dev)
+ static void scsi_disk_release(struct kobject *kobj)
+ {
+       struct scsi_disk *sdkp = to_scsi_disk(kobj);
+-      
++
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      if (sdkp->stats) {
++              remove_proc_entry(sdkp->disk->disk_name, sd_iostats_procdir);
++              kfree(sdkp->stats);
++              sdkp->stats = NULL;
++      }
++#endif
+       put_disk(sdkp->disk);
+ 
+       spin_lock(&sd_index_lock);
+@@ -1605,6 +1707,366 @@ static void sd_shutdown(struct device *d
+       sd_sync_cache(sdp);
+ }     
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++static int
++sd_iostats_seq_show(struct seq_file *seq, void *v)
++{
++      struct timeval     now;
++      struct gendisk *disk = seq->private;
++      iostat_stats_t    *stats;
++      unsigned long long read_len;
++      unsigned long long read_len_tot;
++      unsigned long      read_num;
++      unsigned long      read_num_tot;
++      unsigned long long write_len;
++      unsigned long long write_len_tot;
++      unsigned long      write_num;
++      unsigned long      write_num_tot;
++      int                i;
++      int                maxi;
++
++      stats = scsi_disk(disk)->stats;
++      if (stats == NULL) {
++              printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
++              BUG();
++      }
++
++      do_gettimeofday(&now);
++      now.tv_sec -= stats->iostat_timeval.tv_sec;
++      now.tv_usec -= stats->iostat_timeval.tv_usec;
++      if (now.tv_usec < 0) {
++              now.tv_usec += 1000000;
++              now.tv_sec--;
++      }
++
++      /* this sampling races with updates */
++      seq_printf(seq, "index:        %lu   snapshot_time:         %lu.%06lu\n",
++                      (unsigned long) scsi_disk(disk)->index,
++                      now.tv_sec, now.tv_usec);
++
++      for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--)
++              if (stats->iostat_read_histogram[i].iostat_count != 0 ||
++                              stats->iostat_write_histogram[i].iostat_count != 0)
++                      break;
++      maxi = i;
++
++      seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", 
++                      "reads", "total", "writes", "total");
++
++      read_len_tot = write_len_tot = 0;
++      read_num_tot = write_num_tot = 0;
++      for (i = 0; i <= maxi; i++) {
++              read_len = stats->iostat_read_histogram[i].iostat_size;
++              read_len_tot += read_len;
++              read_num = stats->iostat_read_histogram[i].iostat_count;
++              read_num_tot += read_num;
++
++              write_len = stats->iostat_write_histogram[i].iostat_size;
++              write_len_tot += write_len;
++              write_num = stats->iostat_write_histogram[i].iostat_count;
++              write_num_tot += write_num;
++
++              seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", 
++                              512<<i, read_num, read_len, write_num, write_len);
++      }
++
++      seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n\n", "total",
++                      read_num_tot, read_len_tot, 
++                      write_num_tot, write_len_tot);
++
++      seq_printf(seq, "%8s %8s %8s\n", "qdepth", "ticks", "%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long long ticks, percent;
++              ticks = stats->iostat_queue_ticks[i];
++              if (ticks == 0)
++                      continue;
++              percent = stats->iostat_queue_ticks[i] * 100;
++              do_div(percent, stats->iostat_queue_ticks_sum);
++              seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent);
++      }
++
++      if (stats->iostat_reqs != 0) {
++              unsigned long long aveseek = 0, percent = 0;
++
++              if (stats->iostat_seeks) {
++                      aveseek = stats->iostat_seek_sectors;
++                      do_div(aveseek, stats->iostat_seeks);
++                      percent = stats->iostat_seeks * 100;
++                      do_div(percent, stats->iostat_reqs);
++              }
++
++              seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over "
++                              "%llu sectors in ave, %llu%% of all reqs\n",
++                              stats->iostat_sectors, stats->iostat_reqs,
++                              stats->iostat_seeks, aveseek, percent);
++      }
++
++      seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads",
++                      "%%", "writes", "%%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long read_percent = 0, write_percent = 0;
++              if (stats->iostat_wtime[i] == 0 &&
++                              stats->iostat_rtime[i] == 0)
++                      continue;
++              if (stats->iostat_read_reqs)
++                      read_percent = stats->iostat_rtime[i] * 100 / 
++                              stats->iostat_read_reqs;
++              if (stats->iostat_write_reqs)
++                      write_percent = stats->iostat_wtime[i] * 100 / 
++                              stats->iostat_write_reqs;
++              seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++                              jiffies_to_msecs(((1UL << i) >> 1) << 1),
++                              stats->iostat_rtime[i], read_percent,
++                              stats->iostat_wtime[i], write_percent);
++      }
++
++      seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads",
++                      "%%", "writes", "%%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long read_percent = 0, write_percent = 0;
++              if (stats->iostat_wtime_in_queue[i] == 0 &&
++                              stats->iostat_rtime_in_queue[i] == 0)
++                      continue;
++              if (stats->iostat_read_reqs)
++                      read_percent = stats->iostat_rtime_in_queue[i] * 100 / 
++                              stats->iostat_read_reqs;
++              if (stats->iostat_write_reqs)
++                      write_percent = stats->iostat_wtime_in_queue[i] * 100 / 
++                              stats->iostat_write_reqs;
++              seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++                              jiffies_to_msecs(((1UL << i) >> 1) << 1),
++                              stats->iostat_rtime_in_queue[i],
++                              read_percent,
++                              stats->iostat_wtime_in_queue[i],
++                              write_percent);
++      }
++
++      return 0;
++}
++
++static void *
++sd_iostats_seq_start(struct seq_file *p, loff_t *pos)
++{
++      return (*pos == 0) ? (void *)1 : NULL;
++}
++
++static void *
++sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos)
++{
++      ++*pos;
++      return NULL;
++}
++
++static void
++sd_iostats_seq_stop(struct seq_file *p, void *v)
++{
++}
++
++static struct seq_operations sd_iostats_seqops = {
++      .start = sd_iostats_seq_start,
++      .stop  = sd_iostats_seq_stop,
++      .next  = sd_iostats_seq_next,
++      .show  = sd_iostats_seq_show,
++};
++
++static int
++sd_iostats_seq_open (struct inode *inode, struct file *file)
++{
++      int rc;
++
++      rc = seq_open(file, &sd_iostats_seqops);
++      if (rc != 0)
++              return rc;
++
++      ((struct seq_file *)file->private_data)->private = PDE(inode)->data;
++      return 0;
++}
++
++static ssize_t
++sd_iostats_seq_write(struct file *file, const char *buffer,
++                   size_t len, loff_t *off)
++{
++      struct seq_file   *seq = file->private_data;
++      struct gendisk *disk = seq->private;
++      iostat_stats_t    *stats = scsi_disk(disk)->stats;
++      unsigned long      flags;
++      unsigned long      qdepth;
++
++
++      spin_lock_irqsave (&stats->iostat_lock, flags);
++      qdepth = stats->iostat_queue_depth;
++      memset (stats, 0, offsetof(iostat_stats_t, iostat_lock));
++      do_gettimeofday(&stats->iostat_timeval);
++      stats->iostat_queue_stamp = jiffies;
++      stats->iostat_queue_depth = qdepth;
++      spin_unlock_irqrestore (&stats->iostat_lock, flags);
++
++      return len;
++}
++
++static struct file_operations sd_iostats_proc_fops = {
++      .owner   = THIS_MODULE,
++      .open    = sd_iostats_seq_open,
++      .read    = seq_read,
++      .write   = sd_iostats_seq_write,
++      .llseek  = seq_lseek,
++      .release = seq_release,
++};
++
++extern struct proc_dir_entry *proc_scsi;
++
++void
++sd_iostats_init(void)
++{
++      if (proc_scsi == NULL) {
++              printk(KERN_WARNING "No access to sd iostats: "
++                      "proc_scsi is NULL\n");
++              return;
++      }
++
++      sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
++                      S_IFDIR | S_IRUGO | S_IXUGO,
++                      proc_scsi);
++      if (sd_iostats_procdir == NULL) {
++              printk(KERN_WARNING "No access to sd iostats: "
++                      "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
++              return;
++      }
++}
++
++void sd_iostats_fini(void)
++{
++      if (proc_scsi != NULL && sd_iostats_procdir != NULL)
++              remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
++
++      sd_iostats_procdir = NULL;
++}
++
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)
++{
++      struct request          *rq = SCpnt->request;
++      iostat_stats_t          *stats;
++      unsigned long           *tcounter;
++      int                     tbucket;
++      int                     tmp;
++      unsigned long           irqflags;
++      unsigned long           i;
++
++      stats = scsi_disk(rq->rq_disk)->stats;
++      if (stats == NULL)
++              return;
++
++      tmp = jiffies -  rq->start_time;
++      for (tbucket = 0; tmp > 1; tbucket++)
++              tmp >>= 1;
++      if (tbucket >= IOSTAT_NCOUNTERS)
++              tbucket = IOSTAT_NCOUNTERS - 1;
++      //printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket);
++
++      tcounter = rq_data_dir(rq) == WRITE ? 
++              &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket];
++
++      spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++      /* update delay stats */
++      (*tcounter)++;
++
++      /* update queue depth stats */
++      i = stats->iostat_queue_depth;
++      if (i >= IOSTAT_NCOUNTERS)
++              i = IOSTAT_NCOUNTERS - 1;
++      stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++      stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++      BUG_ON(stats->iostat_queue_depth == 0);
++      stats->iostat_queue_depth--;
++
++      /* update seek stats. XXX: not sure about nr_sectors */
++      stats->iostat_sectors += rq->nr_sectors;
++      stats->iostat_reqs++;
++      if (rq->sector != stats->iostat_next_sector) {
++              stats->iostat_seek_sectors += 
++                      rq->sector > stats->iostat_next_sector ?
++                      rq->sector - stats->iostat_next_sector :
++                      stats->iostat_next_sector - rq->sector;
++              stats->iostat_seeks++;
++      }
++      stats->iostat_next_sector = rq->sector + rq->nr_sectors;
++
++      stats->iostat_queue_stamp = jiffies;
++
++      spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt)
++{
++      struct request          *rq = SCpnt->request;
++      iostat_stats_t          *stats;
++      iostat_counter_t        *counter;
++      int                     bucket;
++      int                     tbucket;
++      int                     tmp;
++      unsigned long           irqflags;
++      unsigned long           i;
++      int                     nsect;
++
++      stats = scsi_disk(rq->rq_disk)->stats;
++      if (stats == NULL)
++              return;
++
++      nsect = SCpnt->request_bufflen >> 9;
++      for (bucket = 0, tmp = nsect; tmp > 1; bucket++)
++              tmp >>= 1;
++
++      if (bucket >= IOSTAT_NCOUNTERS) {
++              printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect);
++              BUG();
++      }
++
++      counter = rq_data_dir(rq) == WRITE ? 
++              &stats->iostat_write_histogram[bucket] :
++              &stats->iostat_read_histogram[bucket];
++
++      tmp = jiffies - rq->start_time;
++      for (tbucket = 0; tmp > 1; tbucket++)
++              tmp >>= 1;
++      if (tbucket >= IOSTAT_NCOUNTERS)
++              tbucket = IOSTAT_NCOUNTERS - 1;
++      //printk("%u ticks in Q to %u\n", jiffies - rq->start_time, tbucket);
++
++      /* an ugly hack to know exact processing time. the right
++       * solution is to add one more field to struct request
++       * hopefully it will break nothing ... */
++      rq->start_time = jiffies;
++
++      spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++      /* update queue depth stats */
++      i = stats->iostat_queue_depth;
++      if (i >= IOSTAT_NCOUNTERS)
++              i = IOSTAT_NCOUNTERS - 1;
++      stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++      stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++      stats->iostat_queue_depth++;
++
++      /* update delay stats */
++      if (rq_data_dir(rq) == WRITE) {
++              stats->iostat_wtime_in_queue[tbucket]++;
++              stats->iostat_write_reqs++;
++      } else {
++              stats->iostat_rtime_in_queue[tbucket]++;
++              stats->iostat_read_reqs++;
++      }
++
++      /* update size stats */
++      counter->iostat_size += nsect;
++      counter->iostat_count++;
++
++      stats->iostat_queue_stamp = jiffies;
++
++      spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++#endif
++
+ /**
+  *    init_sd - entry point for this driver (both when built in or when
+  *    a module).
+@@ -1614,6 +2076,7 @@ static void sd_shutdown(struct device *d
+ static int __init init_sd(void)
+ {
+       int majors = 0, i;
++      int rc = 0;
+ 
+       SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
+ 
+@@ -1624,7 +2087,11 @@ static int __init init_sd(void)
+       if (!majors)
+               return -ENODEV;
+ 
+-      return scsi_register_driver(&sd_template.gendrv);
++      sd_iostats_init();
++      rc = scsi_register_driver(&sd_template.gendrv);
++      if (rc)
++              sd_iostats_fini();
++      return rc;
+ }
+ 
+ /**
+@@ -1641,6 +2108,7 @@ static void __exit exit_sd(void)
+       scsi_unregister_driver(&sd_template.gendrv);
+       for (i = 0; i < SD_MAJORS; i++)
+               unregister_blkdev(sd_major(i), "sd");
++      sd_iostats_fini();
+ }
+ 
+ MODULE_LICENSE("GPL");
diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch b/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch

index d6efdc6..9e822d2 100644 (file)
--- a/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch
+++ b/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch
@@ -1,8 +1,8 @@
-Index: linux-2.6.22.5/drivers/scsi/Kconfig
+Index: linux-2.6.22.19/drivers/scsi/Kconfig
  ===================================================================
---- linux-2.6.22.5.orig/drivers/scsi/Kconfig   2007-08-22 17:23:54.000000000 -0600
-+++ linux-2.6.22.5/drivers/scsi/Kconfig        2008-02-21 01:20:41.000000000 -0700
-@@ -76,6 +76,14 @@
+--- linux-2.6.22.19.orig/drivers/scsi/Kconfig
++++ linux-2.6.22.19/drivers/scsi/Kconfig
+@@ -76,6 +76,14 @@ config BLK_DEV_SD
           In this case, do not compile the driver for your SCSI host adapter
           (below) as a module either.
   
@@ -17,99 +17,104 @@ Index: linux-2.6.22.5/drivers/scsi/Kconfig
   config CHR_DEV_ST
         tristate "SCSI tape support"
         depends on SCSI
-Index: linux-2.6.22.5/drivers/scsi/sd.c
+Index: linux-2.6.22.19/drivers/scsi/scsi_proc.c
  ===================================================================
---- linux-2.6.22.5.orig/drivers/scsi/sd.c      2007-08-22 17:23:54.000000000 -0600
-+++ linux-2.6.22.5/drivers/scsi/sd.c   2008-02-21 01:20:41.000000000 -0700
-@@ -62,6 +62,38 @@
+--- linux-2.6.22.19.orig/drivers/scsi/scsi_proc.c
++++ linux-2.6.22.19/drivers/scsi/scsi_proc.c
+@@ -40,7 +40,8 @@
+ /* 4K page size, but our output routines, use some slack for overruns */
+ #define PROC_BLOCK_SIZE (3*1024)
+ 
+-static struct proc_dir_entry *proc_scsi;
++struct proc_dir_entry *proc_scsi;
++EXPORT_SYMBOL(proc_scsi);
   
- #include "scsi_logging.h"
+ /* Protect sht->present and sht->proc_dir */
+ static DEFINE_MUTEX(global_host_template_mutex);
+Index: linux-2.6.22.19/drivers/scsi/sd.c
+===================================================================
+--- linux-2.6.22.19.orig/drivers/scsi/sd.c
++++ linux-2.6.22.19/drivers/scsi/sd.c
+@@ -94,6 +94,24 @@ static DEFINE_SPINLOCK(sd_index_lock);
+  * object after last put) */
+ static DEFINE_MUTEX(sd_ref_mutex);
   
  +#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
  +# include <linux/proc_fs.h>
  +# include <linux/seq_file.h>
-+
-+typedef struct {
-+        unsigned long long iostat_size;
-+        unsigned long long iostat_count;
-+} iostat_counter_t;
-+
-+#define IOSTAT_NCOUNTERS 16
-+typedef struct {
-+        iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];
-+        iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS];
-+        struct timeval          iostat_timeval;
-+} iostat_stats_t;
-+
-+iostat_stats_t       **sd_iostats;
-+spinlock_t             sd_iostats_lock;
-+struct proc_dir_entry *sd_iostats_procdir;
-+char                   sd_iostats_procdir_name[] = "sd_iostats";
++struct proc_dir_entry *sd_iostats_procdir = NULL;
++char sd_iostats_procdir_name[] = "sd_iostats";
++static struct file_operations sd_iostats_proc_fops;
  +
  +extern void sd_iostats_init(void);
-+extern void sd_iostats_init_disk(struct gendisk *);
  +extern void sd_iostats_fini(void);
-+extern void sd_iostats_bump(int disk, unsigned int nsect, int iswrite);
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
  +#else
  +static inline void sd_iostats_init(void) {}
-+static inline void sd_iostats_init_disk(struct gendisk *disk) {}
  +static inline void sd_iostats_fini(void) {}
-+static inline void sd_iostats_bump(int disk, unsigned int nsect, int iswrite) {}
++static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
++static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
  +#endif
  +
- MODULE_AUTHOR("Eric Youngdale");
- MODULE_DESCRIPTION("SCSI disk (sd) driver");
- MODULE_LICENSE("GPL");
-@@ -89,6 +121,7 @@
- static DEFINE_IDR(sd_index_idr);
- static DEFINE_SPINLOCK(sd_index_lock);
+ static const char *sd_cache_types[] = {
+       "write through", "none", "write back",
+       "write back, no read (daft)"
+@@ -498,6 +516,8 @@ static int sd_init_command(struct scsi_c
+        */
+       SCpnt->done = sd_rw_intr;
   
-+#define SD_STATS 256
- /* This semaphore is used to mediate the 0->1 reference get in the
-  * face of object destruction (i.e. we can't allow a get on an
-  * object after last put) */
-@@ -368,6 +401,9 @@
-       SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "block=%llu\n",
-                                       (unsigned long long)block));
- 
-+   sd_iostats_bump(scsi_disk(disk)->index, this_count,
-+                   rq_data_dir(SCpnt->request) == WRITE);
++      sd_iostats_start_req(SCpnt);
  +
         /*
-        * If we have a 1K hardware sectorsize, prevent access to single
-        * 512 byte sectors.  In theory we could handle this - in fact
-@@ -575,6 +611,7 @@
-                       scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);
+        * This indicates that the command is ready from our end to be
+        * queued.
+@@ -980,6 +1000,7 @@ static void sd_rw_intr(struct scsi_cmnd 
+               break;
         }
+  out:
++      sd_iostats_finish_req(SCpnt);
+       scsi_io_completion(SCpnt, good_bytes);
+ }
   
-+   sd_iostats_init_disk(disk);
-       return 0;
- 
- error_out:
-@@ -601,8 +638,20 @@
- 
-       SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n"));
+@@ -1666,6 +1687,36 @@ static int sd_probe(struct device *dev)
+       if (sdp->removable)
+               gd->flags |= GENHD_FL_REMOVABLE;
   
--      if (!--sdkp->openers && sdev->removable) {
--              if (scsi_block_when_processing_errors(sdev))
-+      if (!--sdkp->openers) {
-+              /*
-+               * Remove sd_iostats information about this disk
-+               */
-+              if (sd_iostats_procdir != NULL) {
-+                      remove_proc_entry(disk->disk_name, sd_iostats_procdir);
-+              }
-+              if (sd_iostats != NULL) {
-+                      if (sd_iostats[sdkp->index] != NULL) {
-+                              kfree (sd_iostats[sdkp->index]);
-+                              sd_iostats[sdkp->index] = NULL;
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL);
++      if (!sdkp->stats) {
++              printk(KERN_WARNING "cannot allocate iostat structure for"
++                                  "%s\n", gd->disk_name);
++      } else {
++              do_gettimeofday(&sdkp->stats->iostat_timeval);
++              sdkp->stats->iostat_queue_stamp = jiffies;
++              spin_lock_init(&sdkp->stats->iostat_lock);
++              if (sd_iostats_procdir) {
++                      struct proc_dir_entry *pde;
++                      pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR,
++                                              sd_iostats_procdir);
++                      if (!pde) {
++                              printk(KERN_WARNING "Can't create /proc/scsi/"
++                                                  "%s/%s\n",
++                                                  sd_iostats_procdir_name,
++                                                  gd->disk_name);
++                              kfree(sdkp->stats);
++                              sdkp->stats = NULL;
++                      } else {
++                              pde->proc_fops = &sd_iostats_proc_fops;
++                              pde->data = gd;
  +                      }
++              } else {
++                      kfree(sdkp->stats);
++                      sdkp->stats = NULL;
  +              }
-+              if (sdev->removable && scsi_block_when_processing_errors(sdev))
-                       scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW);
-       }
++      }
++#endif
+       dev_set_drvdata(dev, sdkp);
+       add_disk(gd);
   
-@@ -1563,6 +1612,342 @@
+@@ -1709,6 +1760,366 @@ static int sd_remove(struct device *dev)
         return 0;
   }
   
@@ -117,101 +122,150 @@ Index: linux-2.6.22.5/drivers/scsi/sd.c
  +static int
  +sd_iostats_seq_show(struct seq_file *seq, void *v)
  +{
-+        struct timeval     now;
-+        struct gendisk *disk;
-+        iostat_stats_t    *stats;
-+        unsigned long long read_len;
-+        unsigned long long read_len_tot;
-+        unsigned long      read_num;
-+        unsigned long      read_num_tot;
-+        unsigned long long write_len;
-+        unsigned long long write_len_tot;
-+        unsigned long      write_num;
-+        unsigned long      write_num_tot;
-+        int                i;
-+        int                maxi;
-+
-+      if (seq == NULL || seq->private == NULL) {
-+              printk(KERN_ERR "sd_iostats_seq_show: NULL disk\n");
++      struct timeval     now;
++      struct gendisk *disk = seq->private;
++      iostat_stats_t    *stats;
++      unsigned long long read_len;
++      unsigned long long read_len_tot;
++      unsigned long      read_num;
++      unsigned long      read_num_tot;
++      unsigned long long write_len;
++      unsigned long long write_len_tot;
++      unsigned long      write_num;
++      unsigned long      write_num_tot;
++      int                i;
++      int                maxi;
++
++      stats = scsi_disk(disk)->stats;
++      if (stats == NULL) {
++              printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
  +              BUG();
  +      }
  +
-+      disk = seq->private;
++      do_gettimeofday(&now);
++      now.tv_sec -= stats->iostat_timeval.tv_sec;
++      now.tv_usec -= stats->iostat_timeval.tv_usec;
++      if (now.tv_usec < 0) {
++              now.tv_usec += 1000000;
++              now.tv_sec--;
++      }
++
++      /* this sampling races with updates */
++      seq_printf(seq, "index:        %lu   snapshot_time:         %lu.%06lu\n",
++                      (unsigned long) scsi_disk(disk)->index,
++                      now.tv_sec, now.tv_usec);
++
++      for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--)
++              if (stats->iostat_read_histogram[i].iostat_count != 0 ||
++                              stats->iostat_write_histogram[i].iostat_count != 0)
++                      break;
++      maxi = i;
++
++      seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", 
++                      "reads", "total", "writes", "total");
++
++      read_len_tot = write_len_tot = 0;
++      read_num_tot = write_num_tot = 0;
++      for (i = 0; i <= maxi; i++) {
++              read_len = stats->iostat_read_histogram[i].iostat_size;
++              read_len_tot += read_len;
++              read_num = stats->iostat_read_histogram[i].iostat_count;
++              read_num_tot += read_num;
++
++              write_len = stats->iostat_write_histogram[i].iostat_size;
++              write_len_tot += write_len;
++              write_num = stats->iostat_write_histogram[i].iostat_count;
++              write_num_tot += write_num;
++
++              seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", 
++                              512<<i, read_num, read_len, write_num, write_len);
++      }
++
++      seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n\n", "total",
++                      read_num_tot, read_len_tot, 
++                      write_num_tot, write_len_tot);
++
++      seq_printf(seq, "%8s %8s %8s\n", "qdepth", "ticks", "%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long long ticks, percent;
++              ticks = stats->iostat_queue_ticks[i];
++              if (ticks == 0)
++                      continue;
++              percent = stats->iostat_queue_ticks[i] * 100;
++              do_div(percent, stats->iostat_queue_ticks_sum);
++              seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent);
++      }
++
++      if (stats->iostat_reqs != 0) {
++              unsigned long long aveseek = 0, percent = 0;
++
++              if (stats->iostat_seeks) {
++                      aveseek = stats->iostat_seek_sectors;
++                      do_div(aveseek, stats->iostat_seeks);
++                      percent = stats->iostat_seeks * 100;
++                      do_div(percent, stats->iostat_reqs);
++              }
++
++              seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over "
++                              "%llu sectors in ave, %llu%% of all reqs\n",
++                              stats->iostat_sectors, stats->iostat_reqs,
++                              stats->iostat_seeks, aveseek, percent);
++      }
  +
-+      if (scsi_disk(disk) == NULL || (disk->flags & GENHD_FL_UP) == 0) {
-+              seq_printf(seq, "sd_iostats_seq_show: Device %s "
-+                              "does not exist\n", disk->disk_name);
-+              return 0;
++      seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads",
++                      "%%", "writes", "%%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long read_percent = 0, write_percent = 0;
++              if (stats->iostat_wtime[i] == 0 &&
++                              stats->iostat_rtime[i] == 0)
++                      continue;
++              if (stats->iostat_read_reqs)
++                      read_percent = stats->iostat_rtime[i] * 100 / 
++                              stats->iostat_read_reqs;
++              if (stats->iostat_write_reqs)
++                      write_percent = stats->iostat_wtime[i] * 100 / 
++                              stats->iostat_write_reqs;
++              seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++                              jiffies_to_msecs(((1UL << i) >> 1) << 1),
++                              stats->iostat_rtime[i], read_percent,
++                              stats->iostat_wtime[i], write_percent);
  +      }
  +
-+        if (sd_iostats == NULL) {
-+                printk(KERN_ERR "sd_iostats_seq_show: NULL stats array\n");
-+                BUG();
-+        }
-+
-+        stats = sd_iostats[scsi_disk(disk)->index];
-+        if (stats == NULL) {
-+                seq_printf(seq, "sd_iostats_seq_show: sd_iostats "
-+                              "entry %d does not exist\n",
-+                              scsi_disk(disk)->index);
-+              return 0;
-+        }
-+
-+        do_gettimeofday(&now);
-+        now.tv_sec -= stats->iostat_timeval.tv_sec;
-+        now.tv_usec -= stats->iostat_timeval.tv_usec;
-+        if (now.tv_usec < 0) {
-+                now.tv_usec += 1000000;
-+                now.tv_sec--;
-+        }
-+
-+        /* this sampling races with updates */
-+        seq_printf(seq, "index:        %lu   snapshot_time:         %lu.%06lu\n",
-+                   scsi_disk(disk)->index, now.tv_sec, now.tv_usec);
-+
-+        for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--)
-+                if (stats->iostat_read_histogram[i].iostat_count != 0 ||
-+                    stats->iostat_write_histogram[i].iostat_count != 0)
-+                        break;
-+        maxi = i;
-+
-+        seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", 
-+                   "reads", "total", "writes", "total");
-+
-+        read_len_tot = write_len_tot = 0;
-+        read_num_tot = write_num_tot = 0;
-+        for (i = 0; i <= maxi; i++) {
-+                read_len = stats->iostat_read_histogram[i].iostat_size;
-+                read_len_tot += read_len;
-+                read_num = stats->iostat_read_histogram[i].iostat_count;
-+                read_num_tot += read_num;
-+
-+                write_len = stats->iostat_write_histogram[i].iostat_size;
-+                write_len_tot += write_len;
-+                write_num = stats->iostat_write_histogram[i].iostat_count;
-+                write_num_tot += write_num;
-+
-+                seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", 
-+                            512<<i, read_num, read_len, write_num, write_len);
-+        }
-+        
-+        seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n", "total",
-+                   read_num_tot, read_len_tot, 
-+                   write_num_tot, write_len_tot);
-+        return 0;
++      seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads",
++                      "%%", "writes", "%%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long read_percent = 0, write_percent = 0;
++              if (stats->iostat_wtime_in_queue[i] == 0 &&
++                              stats->iostat_rtime_in_queue[i] == 0)
++                      continue;
++              if (stats->iostat_read_reqs)
++                      read_percent = stats->iostat_rtime_in_queue[i] * 100 / 
++                              stats->iostat_read_reqs;
++              if (stats->iostat_write_reqs)
++                      write_percent = stats->iostat_wtime_in_queue[i] * 100 / 
++                              stats->iostat_write_reqs;
++              seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++                              jiffies_to_msecs(((1UL << i) >> 1) << 1),
++                              stats->iostat_rtime_in_queue[i],
++                              read_percent,
++                              stats->iostat_wtime_in_queue[i],
++                              write_percent);
++      }
++
++      return 0;
  +}
  +
  +static void *
  +sd_iostats_seq_start(struct seq_file *p, loff_t *pos)
  +{
-+        return (*pos == 0) ? (void *)1 : NULL;
++      return (*pos == 0) ? (void *)1 : NULL;
  +}
  +
  +static void *
  +sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos)
  +{
-+        ++*pos;
-+        return NULL;
++      ++*pos;
++      return NULL;
  +}
  +
  +static void
@@ -220,50 +274,54 @@ Index: linux-2.6.22.5/drivers/scsi/sd.c
  +}
  +
  +static struct seq_operations sd_iostats_seqops = {
-+        .start = sd_iostats_seq_start,
-+        .stop  = sd_iostats_seq_stop,
-+        .next  = sd_iostats_seq_next,
-+        .show  = sd_iostats_seq_show,
++      .start = sd_iostats_seq_start,
++      .stop  = sd_iostats_seq_stop,
++      .next  = sd_iostats_seq_next,
++      .show  = sd_iostats_seq_show,
  +};
  +
  +static int
  +sd_iostats_seq_open (struct inode *inode, struct file *file)
  +{
-+        int                    rc;
++      int rc;
  +
-+        rc = seq_open(file, &sd_iostats_seqops);
-+        if (rc != 0)
-+                return rc;
++      rc = seq_open(file, &sd_iostats_seqops);
++      if (rc != 0)
++              return rc;
  +
-+        ((struct seq_file *)file->private_data)->private = PDE(inode)->data;
-+        return 0;
++      ((struct seq_file *)file->private_data)->private = PDE(inode)->data;
++      return 0;
  +}
  +
  +static ssize_t
  +sd_iostats_seq_write(struct file *file, const char *buffer,
-+                     size_t len, loff_t *off)
++                   size_t len, loff_t *off)
  +{
-+        struct seq_file   *seq = file->private_data;
-+        struct gendisk *disk = seq->private;
-+        iostat_stats_t    *stats = sd_iostats[scsi_disk(disk)->index];
-+        unsigned long      flags;
-+        
-+        
-+        spin_lock_irqsave (&sd_iostats_lock, flags);
-+        memset (stats, 0, sizeof(*stats));
-+        do_gettimeofday(&stats->iostat_timeval);
-+        spin_unlock_irqrestore (&sd_iostats_lock, flags);
-+
-+        return len;
++      struct seq_file   *seq = file->private_data;
++      struct gendisk *disk = seq->private;
++      iostat_stats_t    *stats = scsi_disk(disk)->stats;
++      unsigned long      flags;
++      unsigned long      qdepth;
++
++
++      spin_lock_irqsave (&stats->iostat_lock, flags);
++      qdepth = stats->iostat_queue_depth;
++      memset (stats, 0, offsetof(iostat_stats_t, iostat_lock));
++      do_gettimeofday(&stats->iostat_timeval);
++      stats->iostat_queue_stamp = jiffies;
++      stats->iostat_queue_depth = qdepth;
++      spin_unlock_irqrestore (&stats->iostat_lock, flags);
++
++      return len;
  +}
  +
  +static struct file_operations sd_iostats_proc_fops = {
-+        .owner   = THIS_MODULE,
-+        .open    = sd_iostats_seq_open,
-+        .read    = seq_read,
-+        .write   = sd_iostats_seq_write,
-+        .llseek  = seq_lseek,
-+        .release = seq_release,
++      .owner   = THIS_MODULE,
++      .open    = sd_iostats_seq_open,
++      .read    = seq_read,
++      .write   = sd_iostats_seq_write,
++      .llseek  = seq_lseek,
++      .release = seq_release,
  +};
  +
  +extern struct proc_dir_entry *proc_scsi;
@@ -271,214 +329,251 @@ Index: linux-2.6.22.5/drivers/scsi/sd.c
  +void
  +sd_iostats_init(void)
  +{
-+        int    i;
-+
-+        spin_lock_init(&sd_iostats_lock);
-+
-+        sd_iostats = kmalloc(SD_STATS * sizeof(iostat_stats_t *), GFP_KERNEL);
-+        if (sd_iostats == NULL) {
-+                printk(KERN_WARNING "Can't keep sd iostats: "
-+                       "ENOMEM allocating stats array size %ld\n",
-+                       SD_STATS * sizeof(iostat_stats_t *));
-+                return;
-+        }
-+
-+        for (i = 0; i < SD_STATS; i++)
-+                sd_iostats[i] = NULL;
-+
-+        if (proc_scsi == NULL) {
-+                printk(KERN_WARNING "No access to sd iostats: "
-+                       "proc_scsi is NULL\n");
-+                return;
-+        }
-+
-+        sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
-+                                               S_IFDIR | S_IRUGO | S_IXUGO,
-+                                               proc_scsi);
-+        if (sd_iostats_procdir == NULL) {
-+                printk(KERN_WARNING "No access to sd iostats: "
-+                       "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
-+                return;
-+        }
-+}
++      if (proc_scsi == NULL) {
++              printk(KERN_WARNING "No access to sd iostats: "
++                      "proc_scsi is NULL\n");
++              return;
++      }
  +
-+void
-+sd_iostats_init_disk(struct gendisk *disk)
-+{
-+        struct proc_dir_entry *pde;
-+        unsigned long          flags;
-+        iostat_stats_t        *stats;
-+
-+        if (sd_iostats == NULL ||
-+            sd_iostats_procdir == NULL)
-+                return;
-+
-+        if (scsi_disk(disk)->index > SD_STATS) {
-+                printk(KERN_ERR "sd_iostats_init_disk: "
-+                       "unexpected disk index %d(%d)\n",
-+                       scsi_disk(disk)->index, SD_STATS);
-+                                  return;
-+        }
-+
-+        if (sd_iostats[scsi_disk(disk)->index] != NULL)
-+                return;
-+
-+        stats = kmalloc(sizeof(*stats), GFP_KERNEL);
-+        if (stats == NULL) {
-+                printk(KERN_WARNING "Can't keep %s iostats: "
-+                       "ENOMEM allocating stats size %ld\n", 
-+                       disk->disk_name, sizeof(*stats));
-+                return;
-+        }
-+
-+        memset (stats, 0, sizeof(*stats));
-+        do_gettimeofday(&stats->iostat_timeval);
-+
-+        spin_lock_irqsave(&sd_iostats_lock, flags);
-+
-+        if (sd_iostats[scsi_disk(disk)->index] != NULL) {
-+                spin_unlock_irqrestore(&sd_iostats_lock, flags);
-+                kfree (stats);
-+                return;
-+        }
-+
-+        sd_iostats[scsi_disk(disk)->index] = stats;
-+        
-+        spin_unlock_irqrestore(&sd_iostats_lock, flags);
-+        
-+        pde = create_proc_entry(disk->disk_name, S_IRUGO | S_IWUSR, 
-+                                sd_iostats_procdir);
-+        if (pde == NULL) {
-+                printk(KERN_WARNING "Can't create /proc/scsi/%s/%s\n",
-+                       sd_iostats_procdir_name, disk->disk_name);
-+        } else {
-+                pde->proc_fops = &sd_iostats_proc_fops;
-+                pde->data = disk;
-+        }
++      sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
++                                             S_IFDIR | S_IRUGO | S_IXUGO,
++                                              proc_scsi);
++      if (sd_iostats_procdir == NULL) {
++              printk(KERN_WARNING "No access to sd iostats: "
++                      "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
++              return;
++      }
  +}
  +
-+static void sd_devname(unsigned int disknum, char *buffer)
++void sd_iostats_fini(void)
  +{
-+        if (disknum < 26)
-+                sprintf(buffer, "sd%c", 'a' + disknum);
-+        else {
-+                unsigned int min1;
-+                unsigned int min2;
-+                /*
-+                 * For larger numbers of disks, we need to go to a new
-+                 * naming scheme.
-+                 */
-+                min1 = disknum / 26;
-+                min2 = disknum % 26;
-+                sprintf(buffer, "sd%c%c", 'a' + min1 - 1, 'a' + min2);
-+        }
++      if (proc_scsi != NULL && sd_iostats_procdir != NULL)
++              remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
++
++      sd_iostats_procdir = NULL;
  +}
  +
-+void
-+sd_iostats_fini(void)
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)
  +{
-+        char name[6];
-+        int  i;
-+        
-+        if (sd_iostats_procdir != NULL) {
-+                for (i = 0; i < SD_STATS; i++) {
-+                        sd_devname(i, name);
-+                        remove_proc_entry(name, sd_iostats_procdir);
-+                }
-+
-+                if (proc_scsi == NULL) {
-+                        printk(KERN_ERR "sd_iostats_fini: proc_scsi NULL\n");
-+                        BUG();
-+                }
-+                remove_proc_entry(sd_iostats_procdir_name,
-+                                  proc_scsi);
-+
-+                sd_iostats_procdir = NULL;
-+        }
-+        
-+        if (sd_iostats != NULL) {
-+                for (i = 0; i < SD_STATS; i++) {
-+                        if (sd_iostats[i] != NULL)
-+                                kfree (sd_iostats[i]);
-+                }
-+                
-+                kfree(sd_iostats);
-+                sd_iostats = NULL;
-+        }
++      struct request          *rq = SCpnt->request;
++      iostat_stats_t          *stats;
++      unsigned long           *tcounter;
++      int                     tbucket;
++      int                     tmp;
++      unsigned long           irqflags;
++      unsigned long           i;
++
++      stats = scsi_disk(rq->rq_disk)->stats;
++      if (stats == NULL)
++              return;
++
++      tmp = jiffies - rq->start_time;
++      for (tbucket = 0; tmp > 1; tbucket++)
++              tmp >>= 1;
++      if (tbucket >= IOSTAT_NCOUNTERS)
++              tbucket = IOSTAT_NCOUNTERS - 1;
++      //printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket);
++
++      tcounter = rq_data_dir(rq) == WRITE ?
++              &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket];
++
++      spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++      /* update delay stats */
++      (*tcounter)++;
++
++      /* update queue depth stats */
++      i = stats->iostat_queue_depth;
++      if (i >= IOSTAT_NCOUNTERS)
++              i = IOSTAT_NCOUNTERS - 1;
++      stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++      stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++      BUG_ON(stats->iostat_queue_depth == 0);
++      stats->iostat_queue_depth--;
++
++      /* update seek stats. XXX: not sure about nr_sectors */
++      stats->iostat_sectors += rq->nr_sectors;
++      stats->iostat_reqs++;
++      if (rq->sector != stats->iostat_next_sector) {
++              stats->iostat_seek_sectors +=
++                      rq->sector > stats->iostat_next_sector ?
++                      rq->sector - stats->iostat_next_sector :
++                      stats->iostat_next_sector - rq->sector;
++              stats->iostat_seeks++;
++      }
++      stats->iostat_next_sector = rq->sector + rq->nr_sectors;
++
++      stats->iostat_queue_stamp = jiffies;
++
++      spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
  +}
  +
-+void
-+sd_iostats_bump(int disk, unsigned int nsect, int iswrite)
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt)
  +{
-+        iostat_stats_t    *stats;
-+        iostat_counter_t  *counter;
-+        int                bucket;
-+        int                tmp;
-+        unsigned long      irqflags;
-+
-+        if (sd_iostats == NULL)
-+                return;
-+
-+        if (disk < 0 || disk >= SD_STATS) {
-+                printk(KERN_ERR "sd_iostats_bump: unexpected disk index %d([0-%d])\n",
-+                       disk, SD_STATS);
-+                BUG();
-+        }
-+
-+        for (bucket = 0, tmp = nsect; tmp > 1; bucket++)
-+                tmp /= 2;
-+
-+        if (bucket >= IOSTAT_NCOUNTERS) {
-+                printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect);
-+                BUG();
-+        }
-+
-+        spin_lock_irqsave(&sd_iostats_lock, irqflags);
-+        
-+        stats = sd_iostats[disk];
-+        if (stats != NULL) {
-+                counter = iswrite ? 
-+                          &stats->iostat_write_histogram[bucket] :
-+                          &stats->iostat_read_histogram[bucket];
-+
-+                counter->iostat_size += nsect;
-+                counter->iostat_count++;
-+        }
-+
-+        spin_unlock_irqrestore(&sd_iostats_lock, irqflags);
++      struct request          *rq = SCpnt->request;
++      iostat_stats_t          *stats;
++      iostat_counter_t        *counter;
++      int                     bucket;
++      int                     tbucket;
++      int                     tmp;
++      unsigned long           irqflags;
++      unsigned long           i;
++      int                     nsect;
++
++      stats = scsi_disk(rq->rq_disk)->stats;
++      if (stats == NULL)
++              return;
++
++      nsect = SCpnt->request_bufflen >> 9;
++      for (bucket = 0, tmp = nsect; tmp > 1; bucket++)
++              tmp >>= 1;
++
++      if (bucket >= IOSTAT_NCOUNTERS) {
++              printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect);
++              BUG();
++      }
++
++      counter = rq_data_dir(rq) == WRITE ?
++              &stats->iostat_write_histogram[bucket] :
++              &stats->iostat_read_histogram[bucket];
++
++      tmp = jiffies - rq->start_time;
++      for (tbucket = 0; tmp > 1; tbucket++)
++              tmp >>= 1;
++      if (tbucket >= IOSTAT_NCOUNTERS)
++              tbucket = IOSTAT_NCOUNTERS - 1;
++      //printk("%u ticks in Q to %u\n", jiffies - rq->start_time, tbucket);
++
++      /* an ugly hack to know exact processing time. the right
++       * solution is to add one more field to struct request
++       * hopefully it will break nothing ... */
++      rq->start_time = jiffies;
++
++      spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++      /* update queue depth stats */
++      i = stats->iostat_queue_depth;
++      if (i >= IOSTAT_NCOUNTERS)
++              i = IOSTAT_NCOUNTERS - 1;
++      stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++      stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++      stats->iostat_queue_depth++;
++
++      /* update delay stats */
++      if (rq_data_dir(rq) == WRITE) {
++              stats->iostat_wtime_in_queue[tbucket]++;
++              stats->iostat_write_reqs++;
++      } else {
++              stats->iostat_rtime_in_queue[tbucket]++;
++              stats->iostat_read_reqs++;
++      }
++
++      /* update size stats */
++      counter->iostat_size += nsect;
++      counter->iostat_count++;
++
++      stats->iostat_queue_stamp = jiffies;
++
++      spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
  +}
  +#endif
  +
   /**
-  *    sd_probe - called during driver initialization and whenever a
-  *    new scsi device is attached to the system. It is called once
-@@ -1854,6 +2239,7 @@
-       err = scsi_register_driver(&sd_template.gendrv);
-       if (err)
-               goto err_out_class;
-+      sd_iostats_init();
+  *    scsi_disk_release - Called to free the scsi_disk structure
+  *    @cdev: pointer to embedded class device
+@@ -1727,10 +2138,16 @@ static void scsi_disk_release(struct cla
+       idr_remove(&sd_index_idr, sdkp->index);
+       spin_unlock(&sd_index_lock);
   
-       return 0;
- 
-@@ -1876,6 +2262,7 @@
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      if (sdkp->stats) {
++              remove_proc_entry(disk->disk_name, sd_iostats_procdir);
++              kfree(sdkp->stats);
++              sdkp->stats = NULL;
++      }
++#endif
+       disk->private_data = NULL;
+       put_disk(disk);
+       put_device(&sdkp->device->sdev_gendev);
+-
+       kfree(sdkp);
+ }
   
-       SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n"));
+@@ -1845,6 +2262,8 @@ static int __init init_sd(void)
+       if (!majors)
+               return -ENODEV;
   
++      sd_iostats_init();
++
+       err = class_register(&sd_disk_class);
+       if (err)
+               goto err_out;
+@@ -1860,6 +2279,7 @@ err_out_class:
+ err_out:
+       for (i = 0; i < SD_MAJORS; i++)
+               unregister_blkdev(sd_major(i), "sd");
  +      sd_iostats_fini();
-       scsi_unregister_driver(&sd_template.gendrv);
-       class_unregister(&sd_disk_class);
+       return err;
+ }
   
-Index: linux-2.6.22.5/drivers/scsi/scsi_proc.c
+Index: linux-2.6.22.19/include/scsi/sd.h
  ===================================================================
---- linux-2.6.22.5.orig/drivers/scsi/scsi_proc.c       2007-08-22 17:23:54.000000000 -0600
-+++ linux-2.6.22.5/drivers/scsi/scsi_proc.c    2008-02-21 01:20:41.000000000 -0700
-@@ -40,7 +40,8 @@
- /* 4K page size, but our output routines, use some slack for overruns */
- #define PROC_BLOCK_SIZE (3*1024)
+--- linux-2.6.22.19.orig/include/scsi/sd.h
++++ linux-2.6.22.19/include/scsi/sd.h
+@@ -31,6 +31,46 @@
+  */
+ #define SD_BUF_SIZE           512
   
--static struct proc_dir_entry *proc_scsi;
-+struct proc_dir_entry *proc_scsi;
-+EXPORT_SYMBOL(proc_scsi);
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++typedef struct {
++      unsigned long long iostat_size;
++      unsigned long long iostat_count;
++} iostat_counter_t;
++
++#define IOSTAT_NCOUNTERS 16
++typedef struct {
++      iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];
++      iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS];
++      struct timeval          iostat_timeval;
++
++      /* queue depth: how well the pipe is filled up */
++      unsigned long long      iostat_queue_ticks[IOSTAT_NCOUNTERS];
++      unsigned long long      iostat_queue_ticks_sum;
++      unsigned long           iostat_queue_depth;
++      unsigned long           iostat_queue_stamp;
++
++      /* seeks: how linear the traffic is */
++      unsigned long long      iostat_next_sector;
++      unsigned long long      iostat_seek_sectors;
++      unsigned long long      iostat_seeks;
++      unsigned long long      iostat_sectors;
++      unsigned long long      iostat_reqs;
++      unsigned long           iostat_read_reqs;
++      unsigned long           iostat_write_reqs;
++
++      /* process time: how long it takes to process requests */
++      unsigned long           iostat_rtime[IOSTAT_NCOUNTERS];
++      unsigned long           iostat_wtime[IOSTAT_NCOUNTERS];
++
++      /* queue time: how long process spent in elevator's queue */
++      unsigned long           iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
++      unsigned long           iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
++
++      /* must be the last field, as it's used to know size to be memset'ed */
++      spinlock_t              iostat_lock;
++} ____cacheline_aligned_in_smp iostat_stats_t;
++#endif
++
+ struct scsi_disk {
+       struct scsi_driver *driver;     /* always &sd_template */
+       struct scsi_device *device;
+@@ -44,6 +84,9 @@ struct scsi_disk {
+       unsigned        WCE : 1;        /* state of disk WCE bit */
+       unsigned        RCD : 1;        /* state of disk RCD bit, unused */
+       unsigned        DPOFUA : 1;     /* state of disk DPOFUA bit */
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      iostat_stats_t  *stats;         /* scsi disk statistics */
++#endif
+ };
+ #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev)
   
- /* Protect sht->present and sht->proc_dir */
- static DEFINE_MUTEX(global_host_template_mutex);
diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series

index 1f2cb66..c1bfc94 100644 (file)
--- a/lustre/kernel_patches/series/2.6-rhel5.series
+++ b/lustre/kernel_patches/series/2.6-rhel5.series
@@ -17,3 +17,6 @@ raid5-stripe-by-stripe-handling-rhel5.patch
  raid5-merge-ios-rhel5.patch
  raid5-zerocopy-rhel5.patch
  md-rebuild-policy.patch
+md-soft-lockups.patch
+jbd-journal-chksum-2.6.18-vanilla.patch
+quota-large-limits-rhel5.patch
diff --git a/lustre/kernel_patches/series/2.6-sles10.series b/lustre/kernel_patches/series/2.6-sles10.series

index 8a34943..f5b9a4d 100644 (file)
--- a/lustre/kernel_patches/series/2.6-sles10.series
+++ b/lustre/kernel_patches/series/2.6-sles10.series
@@ -12,3 +12,4 @@ jbd-stats-2.6-sles10.patch
  i_filter_data.patch
  quota-fix-oops-in-invalidate_dquots.patch
  fmode-exec-2.6-sles10.patch
+quota-large-limits-sles10.patch
diff --git a/lustre/kernel_patches/series/2.6-suse-newer.series b/lustre/kernel_patches/series/2.6-suse-newer.series

index 1f092e8..d031826 100644 (file)
--- a/lustre/kernel_patches/series/2.6-suse-newer.series
+++ b/lustre/kernel_patches/series/2.6-suse-newer.series
@@ -1,7 +1,7 @@
  lustre-version-revert_suse.patch
  lustre_version.patch
  dev_read_only-2.6-lnxi.patch
-sd_iostats-2.6-rhel4.patch 
+sd_iostats-2.6-suse.patch 
  blkdev_tunables-2.6-suse.patch
  uml-exprt-clearuser.patch
  qsnet-suse-2.6.patch 
diff --git a/lustre/kernel_patches/series/2.6.22-vanilla.series b/lustre/kernel_patches/series/2.6.22-vanilla.series

index e259c63..2d29ae1 100644 (file)
--- a/lustre/kernel_patches/series/2.6.22-vanilla.series
+++ b/lustre/kernel_patches/series/2.6.22-vanilla.series
@@ -9,4 +9,5 @@ dev_read_only-2.6.22-vanilla.patch
  export-2.6.18-vanilla.patch 
  8kstack-2.6.12.patch
  export-show_task-2.6.18-vanilla.patch 
-sd_iostats-2.6.22-vanilla.patch 
+sd_iostats-2.6.22-vanilla.patch
+quota-large-limits-rhel5.patch
diff --git a/lustre/kernel_patches/targets/2.6-rhel4.target.in b/lustre/kernel_patches/targets/2.6-rhel4.target.in

index 4822946..7d27ba6 100644 (file)
--- a/lustre/kernel_patches/targets/2.6-rhel4.target.in
+++ b/lustre/kernel_patches/targets/2.6-rhel4.target.in
@@ -1,5 +1,5 @@
  lnxmaj="2.6.9"
-lnxrel="67.0.20.EL"
+lnxrel="67.0.22.EL"
  
  KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2
  SERIES=2.6-rhel4.series
diff --git a/lustre/kernel_patches/targets/2.6-rhel5.target.in b/lustre/kernel_patches/targets/2.6-rhel5.target.in

index ccaa05e..660b7a6 100644 (file)
--- a/lustre/kernel_patches/targets/2.6-rhel5.target.in
+++ b/lustre/kernel_patches/targets/2.6-rhel5.target.in
@@ -1,5 +1,5 @@
  lnxmaj="2.6.18"
-lnxrel="53.1.21.el5"
+lnxrel="92.1.17.el5"
  
  KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2
  SERIES=2.6-rhel5.series
@@ -9,13 +9,13 @@ RHBUILD=1
  LINUX26=1
  LUSTRE_VERSION=@VERSION@
  
-OFED_VERSION=1.3
+OFED_VERSION=1.3.1
  
-BASE_ARCHS="i686 x86_64 ia64"
+BASE_ARCHS="i686 x86_64 ia64 ppc64"
  BIGMEM_ARCHS=""
  BOOT_ARCHS=""
  JENSEN_ARCHS=""
-SMP_ARCHS="i686 x86_64 ia64"
+SMP_ARCHS="i686 x86_64 ia64 ppc64"
  UP_ARCHS=""
  
  for cc in gcc ; do
diff --git a/lustre/kernel_patches/targets/2.6-sles10.target.in b/lustre/kernel_patches/targets/2.6-sles10.target.in

index 95838df..bfa8365 100644 (file)
--- a/lustre/kernel_patches/targets/2.6-sles10.target.in
+++ b/lustre/kernel_patches/targets/2.6-sles10.target.in
@@ -1,5 +1,5 @@
  lnxmaj="2.6.16"
-lnxrel="54-0.2.5"
+lnxrel="60-0.31"
  
  # this is the delimeter that goes between $lnxmaj and $lnxrel
  # defaults to "-"
@@ -19,13 +19,13 @@ LINUX26=1
  # No /boot/Kerntypes* in SLES10
  SUSEBUILD=0
  
-OFED_VERSION=1.3
+OFED_VERSION=1.3.1
  
-BASE_ARCHS="i686 ppc x86_64 ia64"
+BASE_ARCHS="i686 ppc x86_64 ia64 ppc64"
  BIGMEM_ARCHS=""
  BOOT_ARCHS=""
  JENSEN_ARCHS=""
-SMP_ARCHS="x86_64 ia64"
+SMP_ARCHS="x86_64 ia64 ppc64"
  BIGSMP_ARCHS="i686"
  PSERIES64_ARCHS="ppc"
  UP_ARCHS=""
diff --git a/lustre/kernel_patches/targets/2.6-suse.target.in b/lustre/kernel_patches/targets/2.6-suse.target.in

index 8723679..be6dcb8 100644 (file)
--- a/lustre/kernel_patches/targets/2.6-suse.target.in
+++ b/lustre/kernel_patches/targets/2.6-suse.target.in
@@ -1,5 +1,5 @@
  lnxmaj="2.6.5"
-lnxrel="7.311"
+lnxrel="7.314"
  
  KERNEL=linux-$lnxmaj-$lnxrel.tar.bz2
  # they include our patches
@@ -15,7 +15,7 @@ BASE_ARCHS="i686 ppc x86_64 ia64"
  BIGMEM_ARCHS=""
  BOOT_ARCHS=""
  JENSEN_ARCHS=""
-SMP_ARCHS="x86_64 ia64"
+SMP_ARCHS="x86_64 ia64 ppc64"
  BIGSMP_ARCHS="i686"
  PSERIES64_ARCHS="ppc"
  UP_ARCHS=""
diff --git a/lustre/kernel_patches/targets/2.6-vanilla.target.in b/lustre/kernel_patches/targets/2.6-vanilla.target.in

index dd7a0c2..49dbe0f 100644 (file)
--- a/lustre/kernel_patches/targets/2.6-vanilla.target.in
+++ b/lustre/kernel_patches/targets/2.6-vanilla.target.in
@@ -1,8 +1,16 @@
-lnxmaj="2.6.18"
-lnxrel="8"
+lnxmaj="2.6.22"
+lnxrel="14"
+
+# this is the delimeter that goes between $lnxmaj and $lnxrel
+# defaults to "-"
+EXTRA_VERSION_DELIMITER="."
+
+# this is the delimeter that goes before the "smp" at the end of the version
+# defaults to empty
+TARGET_DELIMITER="-"
  
  KERNEL=linux-$lnxmaj.$lnxrel.tar.bz2
-SERIES=2.6.18-vanilla.series
+SERIES=2.6.22-vanilla.series
  VERSION=$lnxmaj
  EXTRA_VERSION="${lnxrel}_lustre.@VERSION@"
  LUSTRE_VERSION=@VERSION@
@@ -16,8 +24,9 @@ BASE_ARCHS="i686 x86_64"
  BIGMEM_ARCHS=""
  BOOT_ARCHS=""
  JENSEN_ARCHS=""
-SMP_ARCHS="i686 x86_64"
-BIGSMP_ARCHS=""
+SMP_ARCHS="x86_64"
+BIGSMP_ARCHS="i686"
+PSERIES64_ARCHS=""
  UP_ARCHS=""
  SRC_ARCHS=""
  
diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch

index eb90349..8b6b6cb 100644 (file)
--- a/lustre/kernel_patches/which_patch
+++ b/lustre/kernel_patches/which_patch
@@ -2,10 +2,10 @@ SERIES                VERSION                  COMMENT
  
  SUPPORTED KERNELS:
  2.6-suse              SLES9 before SP1         already in SLES9 SP1 kernel
-2.6-suse-newer        SLES9: 2.6.5-7.311       extra patches for SLES9 after SP1
-2.6-rhel4             RHEL4: 2.6.9-67.0.20.EL
-2.6-sles10            SLES10: 2.6.16.54-0.2.5
-2.6-rhel5             RHEL5: 2.6.18-92.1.6.el5
+2.6-suse-newer        SLES9: 2.6.5-7.314       extra patches for SLES9 after SP1
+2.6-rhel4             RHEL4: 2.6.9-67.0.22.EL
+2.6-sles10            SLES10: 2.6.16.60-0.31
+2.6-rhel5             RHEL5: 2.6.18-92.1.17.el5
  2.6.18-vanilla        kernel.org: 2.6.18.8
  2.6.22-vanilla        kernel.org: 2.6.22.14
  
diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am

index b5993fc..600c679 100644 (file)
--- a/lustre/ldlm/Makefile.am
+++ b/lustre/ldlm/Makefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  #
  # ldlm is built into ptlrpc
diff --git a/lustre/ldlm/interval_tree.c b/lustre/ldlm/interval_tree.c

index bedf5b3..21fa9e0 100644 (file)
--- a/lustre/ldlm/interval_tree.c
+++ b/lustre/ldlm/interval_tree.c
@@ -1,29 +1,44 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Interval tree library used by ldlm extent lock code
+ * GPL HEADER START
   *
- *  Copyright (c) 2007 Cluster File Systems, Inc.
- *   Author: Huang Wei <huangwei@clusterfs.com>
- *   Author: Jay Xiong <jinshan.xiong@sun.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/interval_tree.c
+ *
+ * Interval tree library used by ldlm extent lock code
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
   */
  #ifdef __KERNEL__
  # include <lustre_dlm.h>
@@ -375,6 +390,7 @@ struct interval_node *interval_insert(struct interval_node *node,
          struct interval_node **p, *parent = NULL;
          ENTRY;
  
+        LASSERT(!interval_is_intree(node));
          p = root;
          while (*p) {
                  parent = *p;
@@ -398,6 +414,7 @@ struct interval_node *interval_insert(struct interval_node *node,
          *p = node;
  
          interval_insert_color(node, root);
+        node->in_intree = 1;
  
          RETURN(NULL);
  }
@@ -513,6 +530,8 @@ void interval_erase(struct interval_node *node,
          int color;
          ENTRY;
  
+        LASSERT(interval_is_intree(node));
+        node->in_intree = 0;
          if (!node->in_left) {
                  child = node->in_right;
          } else if (!node->in_right) {
diff --git a/lustre/ldlm/l_lock.c b/lustre/ldlm/l_lock.c

index e23a755..ac268ec 100644 (file)
--- a/lustre/ldlm/l_lock.c
+++ b/lustre/ldlm/l_lock.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
@@ -68,4 +79,3 @@ void unlock_res_and_lock(struct ldlm_lock *lock)
          unlock_res(res);
          spin_unlock(&lock->l_lock);
  }
-
diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c

index 2db142f..22c42c1 100644 (file)
--- a/lustre/ldlm/ldlm_extent.c
+++ b/lustre/ldlm/ldlm_extent.c
@@ -1,27 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_extent.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
@@ -32,6 +47,7 @@
  #include <lustre_dlm.h>
  #include <obd_support.h>
  #include <obd.h>
+#include <obd_class.h>
  #include <lustre_lib.h>
  
  #include "ldlm_internal.h"
@@ -686,10 +702,25 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                  if (list_empty(&lock->l_res_link))
                          ldlm_resource_add_lock(res, &res->lr_waiting, lock);
                  unlock_res(res);
+
                  rc = ldlm_run_bl_ast_work(&rpc_list);
-                lock_res(res);
  
+                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_OST_FAIL_RACE) &&
+                    !ns_is_client(res->lr_namespace))
+                        class_fail_export(lock->l_export);
+
+                lock_res(res);
                  if (rc == -ERESTART) {
+                        /* 15715: The lock was granted and destroyed after
+                         * resource lock was dropped. Interval node was freed
+                         * in ldlm_lock_destroy. Anyway, this always happens
+                         * when a client is being evicted. So it would be
+                         * ok to return an error. -jay */
+                        if (lock->l_destroyed) {
+                                *err = -EAGAIN;
+                                GOTO(out, rc = -EAGAIN);
+                        }
+
                          /* lock was granted while resource was unlocked. */
                          if (lock->l_granted_mode == lock->l_req_mode) {
                                  /* bug 11300: if the lock has been granted,
@@ -778,6 +809,7 @@ void ldlm_interval_free(struct ldlm_interval *node)
  {
          if (node) {
                  LASSERT(list_empty(&node->li_group));
+                LASSERT(!interval_is_intree(&node->li_node));
                  OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
          }
  }
@@ -830,6 +862,7 @@ void ldlm_extent_add_lock(struct ldlm_resource *res,
  
          node = lock->l_tree_node;
          LASSERT(node != NULL);
+        LASSERT(!interval_is_intree(&node->li_node));
  
          idx = lock_mode_to_index(lock->l_granted_mode);
          LASSERT(lock->l_granted_mode == 1 << idx);
@@ -857,14 +890,13 @@ void ldlm_extent_add_lock(struct ldlm_resource *res,
  void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
  {
          struct ldlm_resource *res = lock->l_resource;
-        struct ldlm_interval *node;
+        struct ldlm_interval *node = lock->l_tree_node;
          struct ldlm_interval_tree *tree;
          int idx;
  
-        if (lock->l_granted_mode != lock->l_req_mode)
+        if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */
                  return;
  
-        LASSERT(lock->l_tree_node != NULL);
          idx = lock_mode_to_index(lock->l_granted_mode);
          LASSERT(lock->l_granted_mode == 1 << idx);
          tree = &res->lr_itree[idx];
diff --git a/lustre/ldlm/ldlm_flock.c b/lustre/ldlm/ldlm_flock.c

index dddd257..5bdd456 100644 (file)
--- a/lustre/ldlm/ldlm_flock.c
+++ b/lustre/ldlm/ldlm_flock.c
@@ -1,27 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2003 Hewlett-Packard Development Company LP.
- *   Developed under the sponsorship of the US Government under
- *   Subcontract No. B514193
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2003 Hewlett-Packard Development Company LP.
+ * Developed under the sponsorship of the US Government under
+ * Subcontract No. B514193
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
@@ -91,7 +105,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, int flags)
                  /* client side - set a flag to prevent sending a CANCEL */
                  lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
  
-                /* when reaching here, it is under lock_res_and_lock(). Thus, 
+                /* when reaching here, it is under lock_res_and_lock(). Thus,
                     need call the nolock version of ldlm_lock_decref_internal*/
                  ldlm_lock_decref_internal_nolock(lock, mode);
          }
@@ -377,14 +391,14 @@ reprocess:
                  new2->l_conn_export = lock->l_conn_export;
                  if (lock->l_export != NULL) {
                          new2->l_export = class_export_get(lock->l_export);
-                        spin_lock(&new2->l_export->exp_ldlm_data.led_lock);
-                        list_add(&new2->l_export_chain,
-                                 &new2->l_export->exp_ldlm_data.led_held_locks);
-                        spin_unlock(&new2->l_export->exp_ldlm_data.led_lock);
+                        if (new2->l_export->exp_lock_hash && 
+                            hlist_unhashed(&new2->l_exp_hash))
+                                lustre_hash_add(new2->l_export->exp_lock_hash,
+                                                &new2->l_remote_handle,
+                                                &new2->l_exp_hash);
                  }
-                if (*flags == LDLM_FL_WAIT_NOREPROC) {
+                if (*flags == LDLM_FL_WAIT_NOREPROC)
                          ldlm_lock_addref_internal_nolock(new2, lock->l_granted_mode);
-                }
  
                  /* insert new2 at lock */
                  ldlm_resource_add_lock(res, ownlocks, new2);
@@ -409,7 +423,7 @@ reprocess:
          if (*flags != LDLM_FL_WAIT_NOREPROC) {
                  if (first_enq) {
                          /* If this is an unlock, reprocess the waitq and
-                         * send completions ASTs for locks that can now be 
+                         * send completions ASTs for locks that can now be
                           * granted. The only problem with doing this
                           * reprocessing here is that the completion ASTs for
                           * newly granted locks will be sent before the unlock
@@ -444,7 +458,7 @@ restart:
          if (added)
                  ldlm_flock_destroy(req, mode, *flags);
  
-        ldlm_resource_dump(D_OTHER, res);
+        ldlm_resource_dump(D_INFO, res);
          RETURN(LDLM_ITER_CONTINUE);
  }
  
@@ -501,7 +515,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, int flags, void *data)
           * holding the lock even if app still believes it has it, since
           * server already dropped it anyway. Only for granted locks too. */
          lock_res_and_lock(lock);
-        if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == 
+        if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
              (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
                  unlock_res_and_lock(lock);
                  if (lock->l_req_mode == lock->l_granted_mode &&
@@ -542,20 +556,22 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, int flags, void *data)
  
          LDLM_DEBUG(lock, "client-side enqueue waking up: rc = %d", rc);
          RETURN(rc);
- 
+
  granted:
+        OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10);
+        LDLM_DEBUG(lock, "client-side enqueue granted");
+        ns = lock->l_resource->lr_namespace;
+        lock_res_and_lock(lock);
+
          /* before flock's complete ast gets here, the flock
           * can possibly be freed by another thread
           */
          if (lock->l_destroyed) {
                  LDLM_DEBUG(lock, "already destroyed by another thread");
+                unlock_res_and_lock(lock);
                  RETURN(0);
          }
  
-        LDLM_DEBUG(lock, "client-side enqueue granted");
-        ns = lock->l_resource->lr_namespace;
-        lock_res_and_lock(lock);
-
          /* take lock off the deadlock detection waitq. */
          spin_lock(&ldlm_flock_waitq_lock);
          list_del_init(&lock->l_flock_waitq);
diff --git a/lustre/ldlm/ldlm_inodebits.c b/lustre/ldlm/ldlm_inodebits.c

index 67d72ae..dd43733 100644 (file)
--- a/lustre/ldlm/ldlm_inodebits.c
+++ b/lustre/ldlm/ldlm_inodebits.c
@@ -1,24 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003, 2004 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_inodebits.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h

index 50dca93..c316918 100644 (file)
--- a/lustre/ldlm/ldlm_internal.h
+++ b/lustre/ldlm/ldlm_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define MAX_STRING_SIZE 128
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index c5293b3..cb742e0 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -106,7 +118,7 @@ out_free:
          if (imp_conn)
                  OBD_FREE(imp_conn, sizeof(*imp_conn));
  out_put:
-        ptlrpc_put_connection(ptlrpc_conn);
+        ptlrpc_connection_put(ptlrpc_conn);
          RETURN(rc);
  }
  
@@ -149,20 +161,20 @@ int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
                                  GOTO(out, rc = -EBUSY);
                          }
  
-                        ptlrpc_put_connection(imp->imp_connection);
+                        ptlrpc_connection_put(imp->imp_connection);
                          imp->imp_connection = NULL;
  
                          dlmexp = class_conn2export(&imp->imp_dlm_handle);
                          if (dlmexp && dlmexp->exp_connection) {
                                  LASSERT(dlmexp->exp_connection ==
                                          imp_conn->oic_conn);
-                                ptlrpc_put_connection(dlmexp->exp_connection);
+                                ptlrpc_connection_put(dlmexp->exp_connection);
                                  dlmexp->exp_connection = NULL;
                          }
                  }
  
                  list_del(&imp_conn->oic_item);
-                ptlrpc_put_connection(imp_conn->oic_conn);
+                ptlrpc_connection_put(imp_conn->oic_conn);
                  OBD_FREE(imp_conn, sizeof(*imp_conn));
                  CDEBUG(D_HA, "imp %p@%s: remove connection %s\n",
                         imp, imp->imp_obd->obd_name, uuid->uuid);
@@ -254,6 +266,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
                  cli->cl_dirty_max = num_physpages << (CFS_PAGE_SHIFT - 3);
          CFS_INIT_LIST_HEAD(&cli->cl_cache_waiters);
          CFS_INIT_LIST_HEAD(&cli->cl_loi_ready_list);
+        CFS_INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
          CFS_INIT_LIST_HEAD(&cli->cl_loi_write_list);
          CFS_INIT_LIST_HEAD(&cli->cl_loi_read_list);
          client_obd_list_lock_init(&cli->cl_loi_list_lock);
@@ -324,7 +337,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
  
          cli->cl_import = imp;
          /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
-        cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
+        cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
          cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
  
          if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
@@ -480,7 +493,7 @@ int client_disconnect_export(struct obd_export *exp)
                  ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
                                         obd->obd_force ? LDLM_FL_LOCAL_ONLY:0,
                                         NULL);
-                ldlm_namespace_free_prior(obd->obd_namespace, imp, 
+                ldlm_namespace_free_prior(obd->obd_namespace, imp,
                                            obd->obd_force);
                  to_be_freed = obd->obd_namespace;
          }
@@ -492,8 +505,11 @@ int client_disconnect_export(struct obd_export *exp)
           * some connect requests in flight, and his need store a connect flags
           * in obd_namespace. bug 14260 */
          obd->obd_namespace = NULL;
-       
-        ptlrpc_free_rq_pool(imp->imp_rq_pool);
+
+        if (imp->imp_rq_pool) {
+                ptlrpc_free_rq_pool(imp->imp_rq_pool);
+                imp->imp_rq_pool = NULL;
+        }
          class_destroy_import(imp);
          cli->cl_import = NULL;
  
@@ -563,11 +579,26 @@ void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
  }
  EXPORT_SYMBOL(target_client_add_cb);
  
-static void 
+static void
  target_start_and_reset_recovery_timer(struct obd_device *obd,
                                        svc_handler_t handler,
                                        struct ptlrpc_request *req,
                                        int new_client);
+void target_stop_recovery(void *, int);
+int target_recovery_check_and_stop(struct obd_device *obd)
+{
+        int abort_recovery = 0;
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        abort_recovery = obd->obd_abort_recovery;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        if (abort_recovery) {
+                target_stop_recovery(obd, 0);
+                return 1;
+        }
+        return 0;
+}
+EXPORT_SYMBOL(target_recovery_check_and_stop);
  
  int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
  {
@@ -579,9 +610,9 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
          struct obd_uuid cluuid;
          struct obd_uuid remote_uuid;
          char *str, *tmp;
-        int rc = 0, abort_recovery;
+        int rc = 0;
          struct obd_connect_data *data;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*data) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*data) };
          lnet_nid_t *client_nid = NULL;
          ENTRY;
  
@@ -618,13 +649,13 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
  
          if (target->obd_no_conn) {
                  LCONSOLE_WARN("%s: temporarily refusing client connection "
-                              "from %s\n", target->obd_name, 
+                              "from %s\n", target->obd_name,
                                libcfs_nid2str(req->rq_peer.nid));
                  GOTO(out, rc = -EAGAIN);
          }
  
-        /* Make sure the target isn't cleaned up while we're here. Yes, 
-           there's still a race between the above check and our incref here. 
+        /* Make sure the target isn't cleaned up while we're here. Yes,
+           there's still a race between the above check and our incref here.
             Really, class_uuid2obd should take the ref. */
          targref = class_incref(target);
  
@@ -653,11 +684,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                  LBUG();
          }
  
-        spin_lock_bh(&target->obd_processing_task_lock);
-        abort_recovery = target->obd_abort_recovery;
-        spin_unlock_bh(&target->obd_processing_task_lock);
-        if (abort_recovery)
-                target_abort_recovery(target);
+        target_recovery_check_and_stop(target);
  
          tmp = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, sizeof conn);
          if (tmp == NULL)
@@ -705,7 +732,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                  goto dont_check_exports;
  
          spin_lock(&target->obd_dev_lock);
-        export = lustre_hash_get_object_by_key(target->obd_uuid_hash_body, &cluuid);
+        export = lustre_hash_lookup(target->obd_uuid_hash, &cluuid);
  
          if (export != NULL && export->exp_connecting) { /* bug 9635, et. al. */
                  CWARN("%s: exp %p already connecting\n",
@@ -717,20 +744,38 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                     req->rq_peer.nid != export->exp_connection->c_peer.nid) {
                  /* make darn sure this is coming from the same peer
                   * if the UUIDs matched */
-                  CWARN("%s: cookie %s seen on new NID %s when "
-                          "existing NID %s is already connected\n",
-                        target->obd_name, cluuid.uuid,
-                  libcfs_nid2str(req->rq_peer.nid),
-                  libcfs_nid2str(export->exp_connection->c_peer.nid));
-                  class_export_put(export);
-                  export = NULL;
-                  rc = -EALREADY;
+                if (data && data->ocd_connect_flags & OBD_CONNECT_MDS) {
+                        /* the MDS UUID can be reused, don't need to wait
+                         * for the export to be evicted */
+                        CWARN("%s: received MDS connection from a new NID %s,"
+                              " removing former export from NID %s\n",
+                            target->obd_name,
+                            libcfs_nid2str(req->rq_peer.nid),
+                            libcfs_nid2str(export->exp_connection->c_peer.nid));
+                        class_fail_export(export);
+                } else {
+                        CWARN("%s: cookie %s seen on new NID %s when "
+                              "existing NID %s is already connected\n",
+                            target->obd_name, cluuid.uuid,
+                            libcfs_nid2str(req->rq_peer.nid),
+                            libcfs_nid2str(export->exp_connection->c_peer.nid));
+                        rc = -EALREADY;
+                }
+                class_export_put(export);
+                export = NULL;
          } else if (export != NULL && export->exp_failed) { /* bug 11327 */
                  CDEBUG(D_HA, "%s: exp %p evict in progress - new cookie needed "
                        "for connect\n", export->exp_obd->obd_name, export);
                  class_export_put(export);
                  export = NULL;
                  rc = -ENODEV;
+        } else if (export != NULL && export->exp_delayed &&
+                   !(data && data->ocd_connect_flags & OBD_CONNECT_VBR)) {
+                spin_unlock(&target->obd_dev_lock);
+                class_fail_export(export);
+                class_export_put(export);
+                export = NULL;
+                GOTO(out, rc = -ENODEV);
          } else if (export != NULL) {
                  spin_lock(&export->exp_lock);
                  export->exp_connecting = 1;
@@ -765,6 +810,12 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                         "cookies not random?\n", target->obd_name,
                         libcfs_nid2str(req->rq_peer.nid), cluuid.uuid);
                  GOTO(out, rc = -EALREADY);
+        } else if (export->exp_delayed && target->obd_recovering) {
+                /* VBR: don't allow delayed connection during recovery */
+                CWARN("%s: NID %s (%s) export was already marked as delayed "
+                      "and will wait for end of recovery\n", target->obd_name,
+                       libcfs_nid2str(req->rq_peer.nid), cluuid.uuid);
+                GOTO(out, rc = -EBUSY);
          } else {
                  OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout);
          }
@@ -796,6 +847,18 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                  lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE);
          client_nid = &req->rq_peer.nid;
  
+        /* VBR: for delayed connections we start recovery */
+        if (export && export->exp_delayed && !export->exp_in_recovery) {
+                LASSERT(!target->obd_recovering);
+                LASSERT(data && data->ocd_connect_flags & OBD_CONNECT_VBR);
+                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_DELAYED |
+                                        MSG_CONNECT_RECOVERING);
+                spin_lock_bh(&target->obd_processing_task_lock);
+                target->obd_version_recov = 1;
+                spin_unlock_bh(&target->obd_processing_task_lock);
+                target_start_and_reset_recovery_timer(target, handler, req, 1);
+        }
+
          if (export == NULL) {
                  if (target->obd_recovering) {
                          CERROR("%s: denying connection for new client %s (%s): "
@@ -812,7 +875,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                                           client_nid);
                  }
          } else {
-                rc = obd_reconnect(export, target, &cluuid, data);
+                rc = obd_reconnect(export, target, &cluuid, data, client_nid);
          }
  
          if (rc)
@@ -876,27 +939,28 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
          }
  
          if (export->exp_connection != NULL)
-                ptlrpc_put_connection(export->exp_connection);
-        export->exp_connection = ptlrpc_get_connection(req->rq_peer,
+                ptlrpc_connection_put(export->exp_connection);
+        export->exp_connection = ptlrpc_connection_get(req->rq_peer,
                                                         req->rq_self,
                                                         &remote_uuid);
  
-        spin_lock(&target->obd_dev_lock);
-        /* Export might be hashed already, e.g. if this is reconnect */
-        if (hlist_unhashed(&export->exp_nid_hash))
-                lustre_hash_additem(export->exp_obd->obd_nid_hash_body,
-                                    &export->exp_connection->c_peer.nid,
-                                    &export->exp_nid_hash);
-        spin_unlock(&target->obd_dev_lock);
+        if (hlist_unhashed(&export->exp_nid_hash)) {
+                lustre_hash_add_unique(export->exp_obd->obd_nid_hash,
+                                       &export->exp_connection->c_peer.nid,
+                                       &export->exp_nid_hash);
+        }
  
          if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) {
                  revimp = class_import_get(export->exp_imp_reverse);
                  GOTO(set_flags, rc = 0);
          }
  
-        if (target->obd_recovering)
+        if (target->obd_recovering && !export->exp_in_recovery) {
+                spin_lock(&export->exp_lock);
+                export->exp_in_recovery = 1;
+                spin_unlock(&export->exp_lock);
                  target->obd_connected_clients++;
-
+        }
          memcpy(&conn,
                 lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, sizeof conn),
                 sizeof conn);
@@ -1023,43 +1087,59 @@ static void target_release_saved_req(struct ptlrpc_request *req)
          OBD_FREE(req, sizeof *req);
  }
  
-static void target_finish_recovery(struct obd_device *obd)
+static void target_send_delayed_replies(struct obd_device *obd)
  {
-        struct list_head *tmp, *n;
+        struct ptlrpc_request *req, *tmp;
  
          LCONSOLE_INFO("%s: sending delayed replies to recovered clients\n",
                        obd->obd_name);
  
-        ldlm_reprocess_all_ns(obd->obd_namespace);
+        list_for_each_entry_safe(req, tmp, &obd->obd_delayed_reply_queue,
+                                 rq_list) {
+                list_del_init(&req->rq_list);
+                DEBUG_REQ(D_HA, req, "delayed:");
+                ptlrpc_reply(req);
+                target_release_saved_req(req);
+        }
+        obd->obd_recovery_end = cfs_time_current_sec();
+}
  
+static void target_finish_recovery(struct obd_device *obd)
+{
+        ldlm_reprocess_all_ns(obd->obd_namespace);
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (list_empty(&obd->obd_recovery_queue)) {
+                obd->obd_processing_task = 0;
+        } else {
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                CERROR("%s: Recovery queue isn't empty\n", obd->obd_name);
+                LBUG();
+        }
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+                ;
          /* when recovery finished, cleanup orphans on mds and ost */
          if (OBT(obd) && OBP(obd, postrecov)) {
                  int rc = OBP(obd, postrecov)(obd);
                  LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
                                rc < 0 ? "failed" : "complete", rc);
          }
-
-        list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
-                struct ptlrpc_request *req;
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                list_del(&req->rq_list);
-                DEBUG_REQ(D_HA, req, "delayed:");
-                ptlrpc_reply(req);
-                target_release_saved_req(req);
-        }
-        obd->obd_recovery_end = cfs_time_current_sec();
+        target_send_delayed_replies(obd);
  }
  
  static void abort_recovery_queue(struct obd_device *obd)
  {
-        struct ptlrpc_request *req;
-        struct list_head *tmp, *n;
+        struct ptlrpc_request *req, *n;
+        struct list_head abort_list;
          int rc;
  
-        list_for_each_safe(tmp, n, &obd->obd_recovery_queue) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+        CFS_INIT_LIST_HEAD(&abort_list);
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        list_splice_init(&obd->obd_recovery_queue, &abort_list);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        /* process abort list unlocked */
+        list_for_each_entry_safe(req, n, &abort_list, rq_list) {
                  target_exp_dequeue_req_replay(req);
-                list_del(&req->rq_list);
+                list_del_init(&req->rq_list);
                  DEBUG_REQ(D_ERROR, req, "aborted:");
                  req->rq_status = -ENOTCONN;
                  req->rq_type = PTL_RPC_MSG_ERR;
@@ -1086,6 +1166,7 @@ void target_cleanup_recovery(struct obd_device *obd)
  {
          struct list_head *tmp, *n;
          struct ptlrpc_request *req;
+        struct list_head clean_list;
          ENTRY;
  
          LASSERT(obd->obd_stopping);
@@ -1106,18 +1187,23 @@ void target_cleanup_recovery(struct obd_device *obd)
                  target_release_saved_req(req);
          }
  
-        list_for_each_safe(tmp, n, &obd->obd_recovery_queue) {
+        CFS_INIT_LIST_HEAD(&clean_list);
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        list_splice_init(&obd->obd_recovery_queue, &clean_list);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        list_for_each_safe(tmp, n, &clean_list) {
                  req = list_entry(tmp, struct ptlrpc_request, rq_list);
                  target_exp_dequeue_req_replay(req);
-                list_del(&req->rq_list);
+                list_del_init(&req->rq_list);
                  target_release_saved_req(req);
          }
          EXIT;
  }
  
-void target_abort_recovery(void *data)
+void target_stop_recovery(void *data, int abort)
  {
          struct obd_device *obd = data;
+        enum obd_option flags;
          ENTRY;
  
          spin_lock_bh(&obd->obd_processing_task_lock);
@@ -1126,35 +1212,66 @@ void target_abort_recovery(void *data)
                  EXIT;
                  return;
          }
-        obd->obd_recovering = obd->obd_abort_recovery = 0;
+        flags = exp_flags_from_obd(obd) | OBD_OPT_ABORT_RECOV;
+        obd->obd_recovering = 0;
+        obd->obd_abort_recovery = 0;
+        obd->obd_processing_task = 0;
+        if (abort == 0)
+                LASSERT(obd->obd_recoverable_clients == 0);
+
          target_cancel_recovery_timer(obd);
          spin_unlock_bh(&obd->obd_processing_task_lock);
  
-        LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected "
-                      "after %lds (%d clients did)\n",
-                      obd->obd_name, obd->obd_recoverable_clients,
-                      cfs_time_current_sec()- obd->obd_recovery_start,
-                      obd->obd_connected_clients);
-        class_disconnect_stale_exports(obd);
+        if (abort) {
+                LCONSOLE_WARN("%s: recovery is aborted by administrative "
+                              "request; %d clients are not recovered "
+                              "(%d clients did)\n", obd->obd_name,
+                              obd->obd_recoverable_clients,
+                              obd->obd_connected_clients);
+                class_disconnect_stale_exports(obd, flags);
+        }
          abort_recovery_queue(obd);
-
          target_finish_recovery(obd);
          CDEBUG(D_HA, "%s: recovery complete\n", obd_uuid2str(&obd->obd_uuid));
          EXIT;
  }
  
+void target_abort_recovery(void *data)
+{
+        target_stop_recovery(data, 1);
+}
+
+static void reset_recovery_timer(struct obd_device *, int, int);
  static void target_recovery_expired(unsigned long castmeharder)
  {
          struct obd_device *obd = (struct obd_device *)castmeharder;
-        CERROR("%s: recovery timed out, aborting\n", obd->obd_name);
+        int version_recov = obd->obd_version_recov;
+        LCONSOLE_WARN("%s: recovery period over; %d clients never reconnected "
+                      "after %lds (%d clients did)\n",
+                      obd->obd_name, obd->obd_recoverable_clients,
+                      cfs_time_current_sec()- obd->obd_recovery_start,
+                      obd->obd_connected_clients);
+
+        /** check is fs version-capable */
+        if (target_fs_version_capable(obd)) {
+                class_handle_stale_exports(obd);
+        } else {
+                CWARN("Versions are not supported by ldiskfs, VBR is OFF\n");
+                class_disconnect_stale_exports(obd, exp_flags_from_obd(obd));
+        }
          spin_lock_bh(&obd->obd_processing_task_lock);
-        if (obd->obd_recovering)
+        /* VBR: no clients are remained to replay, stop recovery */
+        if (obd->obd_recovering && obd->obd_recoverable_clients == 0)
                  obd->obd_abort_recovery = 1;
+        /* always check versions now */
+        obd->obd_version_recov = 1;
          cfs_waitq_signal(&obd->obd_next_transno_waitq);
          spin_unlock_bh(&obd->obd_processing_task_lock);
+        /* reset timer if recovery will proceed with versions now */
+        reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout,
+                             !version_recov);
  }
  
-
  /* obd_processing_task_lock should be held */
  void target_cancel_recovery_timer(struct obd_device *obd)
  {
@@ -1184,15 +1301,15 @@ static void reset_recovery_timer(struct obd_device *obd, int duration,
                  /* Track the client's largest expected replay time */
                  obd->obd_recovery_timeout = duration;
  #ifdef CRAY_XT3
-        /* 
-         * If total recovery time already exceed the 
-         * obd_recovery_max_time, then CRAY XT3 will 
+        /*
+         * If total recovery time already exceed the
+         * obd_recovery_max_time, then CRAY XT3 will
           * abort the recovery
           */
          if(obd->obd_recovery_timeout > obd->obd_recovery_max_time)
                  obd->obd_recovery_timeout = obd->obd_recovery_max_time;
  #endif
-        obd->obd_recovery_end = obd->obd_recovery_start + 
+        obd->obd_recovery_end = obd->obd_recovery_start +
                                  obd->obd_recovery_timeout;
          if (cfs_time_before(now, obd->obd_recovery_end)) {
                  left = cfs_time_sub(obd->obd_recovery_end, now);
@@ -1207,7 +1324,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd,
                                             svc_handler_t handler)
  {
          spin_lock_bh(&obd->obd_processing_task_lock);
-        if (obd->obd_recovery_handler) { 
+        if (obd->obd_recovery_handler) {
                  spin_unlock_bh(&obd->obd_processing_task_lock);
                  return;
          }
@@ -1224,12 +1341,12 @@ static void check_and_start_recovery_timer(struct obd_device *obd,
  
  /* Reset the timer with each new client connection */
  /*
- * This timer is actually reconnect_timer, which is for making sure 
- * the total recovery window is at least as big as my reconnect 
+ * This timer is actually reconnect_timer, which is for making sure
+ * the total recovery window is at least as big as my reconnect
   * attempt timing. So the initial recovery time_out will be set to
   * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming
   * from client is bigger than this, then the recovery time_out will
- * be extend to make sure the client could be reconnected, in the 
+ * be extend to make sure the client could be reconnected, in the
   * process, the timeout from the new client should be ignored.
   */
  
@@ -1239,13 +1356,27 @@ target_start_and_reset_recovery_timer(struct obd_device *obd,
                                        struct ptlrpc_request *req,
                                        int new_client)
  {
-        int req_timeout = OBD_RECOVERY_FACTOR * 
-                          lustre_msg_get_timeout(req->rq_reqmsg);
+        int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
+
+        if (!new_client && service_time)
+                /* Teach server about old server's estimates, as first guess
+                   at how long new requests will take. */
+                at_add(&req->rq_rqbd->rqbd_service->srv_at_estimate,
+                       service_time);
  
          check_and_start_recovery_timer(obd, handler);
  
-        if (req_timeout > obd->obd_recovery_timeout && !new_client)
-                reset_recovery_timer(obd, req_timeout, 0);
+        /* convert the service time to rpc timeout,
+         * reuse service_time to limit stack usage */
+        service_time = at_est2timeout(service_time);
+
+        /* We expect other clients to timeout within service_time, then try
+         * to reconnect, then try the failover server.  The max delay between
+         * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */
+        service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
+                             INITIAL_CONNECT_TIMEOUT);
+        if (service_time > obd->obd_recovery_timeout && !new_client)
+                reset_recovery_timer(obd, service_time, 0);
  }
  
  static int check_for_next_transno(struct obd_device *obd)
@@ -1260,13 +1391,15 @@ static int check_for_next_transno(struct obd_device *obd)
          max = obd->obd_max_recoverable_clients;
          req_transno = lustre_msg_get_transno(req->rq_reqmsg);
          connected = obd->obd_connected_clients;
-        completed = max - obd->obd_recoverable_clients;
+        completed = max - obd->obd_recoverable_clients -
+                    obd->obd_delayed_clients;
          queue_len = obd->obd_requests_queued_for_recovery;
          next_transno = obd->obd_next_recovery_transno;
  
-        CDEBUG(D_HA,"max: %d, connected: %d, completed: %d, queue_len: %d, "
-               "req_transno: "LPU64", next_transno: "LPU64"\n",
-               max, connected, completed, queue_len, req_transno, next_transno);
+        CDEBUG(D_HA,"max: %d, connected: %d, delayed %d, completed: %d, "
+               "queue_len: %d, req_transno: "LPU64", next_transno: "LPU64"\n",
+               max, connected, obd->obd_delayed_clients, completed, queue_len,
+               req_transno, next_transno);
          if (obd->obd_abort_recovery) {
                  CDEBUG(D_HA, "waking for aborted recovery\n");
                  wake_up = 1;
@@ -1276,7 +1409,7 @@ static int check_for_next_transno(struct obd_device *obd)
          } else if (req_transno == next_transno) {
                  CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno);
                  wake_up = 1;
-        } else if (queue_len + completed == max) {
+        } else if (queue_len == obd->obd_recoverable_clients) {
                  CDEBUG(D_ERROR,
                         "waking for skipped transno (skip: "LPD64
                         ", ql: %d, comp: %d, conn: %d, next: "LPD64")\n",
@@ -1292,7 +1425,6 @@ static int check_for_next_transno(struct obd_device *obd)
  static void process_recovery_queue(struct obd_device *obd)
  {
          struct ptlrpc_request *req;
-        int abort_recovery = 0;
          struct l_wait_info lwi = { 0 };
          ENTRY;
  
@@ -1312,13 +1444,8 @@ static void process_recovery_queue(struct obd_device *obd)
                                 req->rq_xid);
                          l_wait_event(obd->obd_next_transno_waitq,
                                       check_for_next_transno(obd), &lwi);
-                        spin_lock_bh(&obd->obd_processing_task_lock);
-                        abort_recovery = obd->obd_abort_recovery;
-                        spin_unlock_bh(&obd->obd_processing_task_lock);
-                        if (abort_recovery) {
-                                target_abort_recovery(obd);
+                        if (target_recovery_check_and_stop(obd))
                                  return;
-                        }
                          continue;
                  }
                  target_exp_dequeue_req_replay(req);
@@ -1329,8 +1456,9 @@ static void process_recovery_queue(struct obd_device *obd)
                  DEBUG_REQ(D_HA, req, "processing: ");
                  (void)obd->obd_recovery_handler(req);
                  obd->obd_replayed_requests++;
-                reset_recovery_timer(obd, OBD_RECOVERY_FACTOR *
-                       AT_OFF ? obd_timeout :
+                /* Extend the recovery timer enough to complete the next
+                 * replayed rpc */
+                reset_recovery_timer(obd, AT_OFF ? obd_timeout :
                         at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1);
                  /* bug 1580: decide how to properly sync() in recovery */
                  //mds_fsync_super(obd->u.obt.obt_sb);
@@ -1338,6 +1466,7 @@ static void process_recovery_queue(struct obd_device *obd)
                  ptlrpc_req_drop_rs(req);
                  OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
                  OBD_FREE(req, sizeof *req);
+                OBD_RACE(OBD_FAIL_TGT_REPLAY_DELAY);
                  spin_lock_bh(&obd->obd_processing_task_lock);
                  obd->obd_next_recovery_transno++;
                  if (list_empty(&obd->obd_recovery_queue)) {
@@ -1481,7 +1610,8 @@ int target_queue_last_replay_reply(struct ptlrpc_request *req, int rc)
          struct obd_device *obd = target_req2obd(req);
          struct ptlrpc_request *saved_req;
          struct lustre_msg *reqmsg;
-        int recovery_done = 0;
+        struct obd_export *exp = req->rq_export;
+        int recovery_done = 0, delayed_done = 0;
  
          LASSERT ((rc == 0) == req->rq_packed_final);
  
@@ -1511,47 +1641,131 @@ int target_queue_last_replay_reply(struct ptlrpc_request *req, int rc)
          spin_lock_bh(&obd->obd_processing_task_lock);
          if (obd->obd_stopping) {
                  spin_unlock_bh(&obd->obd_processing_task_lock);
-                OBD_FREE(reqmsg, req->rq_reqlen);
-                OBD_FREE(saved_req, sizeof *req);
-                req->rq_status = -ENOTCONN;
-                /* rv is ignored anyhow */
-                return -ENOTCONN;
+                goto out_noconn;
+        }
+
+        if (!exp->exp_vbr_failed) {
+                ptlrpc_rs_addref(req->rq_reply_state);  /* +1 ref for saved reply */
+                req = saved_req;
+                req->rq_reqmsg = reqmsg;
+                CFS_INIT_LIST_HEAD(&req->rq_list);
+                CFS_INIT_LIST_HEAD(&req->rq_replay_list);
+                class_export_get(exp);
+                list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
          }
-        ptlrpc_rs_addref(req->rq_reply_state);  /* +1 ref for saved reply */
-        req = saved_req;
-        req->rq_reqmsg = reqmsg;
-        class_export_get(req->rq_export);
-        list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
  
          /* only count the first "replay over" request from each
             export */
-        if (req->rq_export->exp_replay_needed) {
-                --obd->obd_recoverable_clients;
-                
-                spin_lock(&req->rq_export->exp_lock);
-                req->rq_export->exp_replay_needed = 0;
-                spin_unlock(&req->rq_export->exp_lock);
+        if (exp->exp_replay_needed) {
+                spin_lock(&exp->exp_lock);
+                exp->exp_replay_needed = 0;
+                spin_unlock(&exp->exp_lock);
+
+                if (!exp->exp_delayed) {
+                        --obd->obd_recoverable_clients;
+                } else {
+                        spin_lock(&exp->exp_lock);
+                        exp->exp_delayed = 0;
+                        spin_unlock(&exp->exp_lock);
+                        delayed_done = 1;
+                        if (obd->obd_delayed_clients == 0) {
+                                spin_unlock_bh(&obd->obd_processing_task_lock);
+                                LBUG();
+                        }
+                        --obd->obd_delayed_clients;
+                }
          }
          recovery_done = (obd->obd_recoverable_clients == 0);
          spin_unlock_bh(&obd->obd_processing_task_lock);
  
+        if (delayed_done) {
+                /* start pinging export */
+                spin_lock(&obd->obd_dev_lock);
+                list_add_tail(&exp->exp_obd_chain_timed,
+                              &obd->obd_exports_timed);
+                list_move_tail(&exp->exp_obd_chain, &obd->obd_exports);
+                spin_unlock(&obd->obd_dev_lock);
+                target_send_delayed_replies(obd);
+        }
+
          OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS);
          if (recovery_done) {
                  spin_lock_bh(&obd->obd_processing_task_lock);
-                obd->obd_recovering = obd->obd_abort_recovery = 0;
+                obd->obd_recovering = 0;
+                obd->obd_version_recov = 0;
+                obd->obd_abort_recovery = 0;
                  target_cancel_recovery_timer(obd);
                  spin_unlock_bh(&obd->obd_processing_task_lock);
  
-                target_finish_recovery(obd);
+                OBD_RACE(OBD_FAIL_TGT_REPLAY_DELAY);
+
+                if (!delayed_done)
+                        target_finish_recovery(obd);
                  CDEBUG(D_HA, "%s: recovery complete\n",
                         obd_uuid2str(&obd->obd_uuid));
          } else {
                  CWARN("%s: %d recoverable clients remain\n",
-                       obd->obd_name, obd->obd_recoverable_clients);
+                      obd->obd_name, obd->obd_recoverable_clients);
                  cfs_waitq_signal(&obd->obd_next_transno_waitq);
          }
  
+        /* VBR: disconnect export with failed recovery */
+        if (exp->exp_vbr_failed) {
+                CWARN("%s: disconnect export %s\n", obd->obd_name,
+                      exp->exp_client_uuid.uuid);
+                class_fail_export(exp);
+                OBD_FREE(reqmsg, req->rq_reqlen);
+                OBD_FREE(saved_req, sizeof *req);
+                req->rq_status = 0;
+                ptlrpc_send_reply(req, 0);
+        }
+
          return 1;
+
+out_noconn:
+        OBD_FREE(reqmsg, req->rq_reqlen);
+        OBD_FREE(saved_req, sizeof *req);
+        req->rq_status = -ENOTCONN;
+        /* rv is ignored anyhow */
+        return -ENOTCONN;
+}
+
+int target_handle_reply(struct ptlrpc_request *req, int rc, int fail)
+{
+        struct obd_device *obd = NULL;
+
+        if (req->rq_export)
+                obd = target_req2obd(req);
+
+        /* handle replay reply for version recovery */
+        if (obd && obd->obd_version_recov &&
+            (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+                LASSERT(req->rq_repmsg);
+                lustre_msg_add_flags(req->rq_repmsg, MSG_VERSION_REPLAY);
+        }
+
+        /* handle last replay */
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
+                if (obd &&
+                    lustre_msg_get_flags(req->rq_reqmsg) & MSG_DELAY_REPLAY) {
+                        DEBUG_REQ(D_HA, req,
+                                  "delayed LAST_REPLAY, queuing reply");
+                        rc = target_queue_last_replay_reply(req, rc);
+                        LASSERT(req->rq_export->exp_delayed == 0);
+                        return rc;
+                }
+
+                if (obd && obd->obd_recovering) { /* normal recovery */
+                        DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
+                        rc = target_queue_last_replay_reply(req, rc);
+                        return rc;
+                }
+
+                /* Lost a race with recovery; let the error path DTRT. */
+                rc = req->rq_status = -ENOTCONN;
+        }
+        target_send_reply(req, rc, fail);
+        return 0;
  }
  
  static inline struct ldlm_pool *ldlm_exp2pl(struct obd_export *exp)
@@ -1564,9 +1778,9 @@ int target_pack_pool_reply(struct ptlrpc_request *req)
  {
          struct obd_device *obd;
          ENTRY;
-   
-        /* 
-         * Check that we still have all structures alive as this may 
+
+        /*
+         * Check that we still have all structures alive as this may
           * be some late rpc in shutdown time.
           */
          if (unlikely(!req->rq_export || !req->rq_export->exp_obd ||
@@ -1576,8 +1790,8 @@ int target_pack_pool_reply(struct ptlrpc_request *req)
                  RETURN(0);
          }
  
-        /* 
-         * OBD is alive here as export is alive, which we checked above. 
+        /*
+         * OBD is alive here as export is alive, which we checked above.
           */
          obd = req->rq_export->exp_obd;
  
@@ -1640,7 +1854,7 @@ target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
          LASSERT (list_empty(&rs->rs_obd_list));
          LASSERT (list_empty(&rs->rs_exp_list));
  
-        exp = class_export_get (req->rq_export);
+        exp = class_export_get(req->rq_export);
          obd = exp->exp_obd;
  
          /* disable reply scheduling onto srv_reply_queue while I'm setting up */
@@ -1650,15 +1864,16 @@ target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
          rs->rs_transno   = req->rq_transno;
          rs->rs_export    = exp;
  
-        spin_lock(&obd->obd_uncommitted_replies_lock);
+        spin_lock(&exp->exp_uncommitted_replies_lock);
  
-        if (rs->rs_transno > obd->obd_last_committed) {
+        /* VBR: use exp_last_committed */
+        if (rs->rs_transno > exp->exp_last_committed) {
                  /* not committed already */
                  list_add_tail (&rs->rs_obd_list,
-                               &obd->obd_uncommitted_replies);
+                               &exp->exp_uncommitted_replies);
          }
  
-        spin_unlock (&obd->obd_uncommitted_replies_lock);
+        spin_unlock (&exp->exp_uncommitted_replies_lock);
          spin_lock (&exp->exp_lock);
  
          list_add_tail (&rs->rs_exp_list, &exp->exp_outstanding_replies);
@@ -1697,28 +1912,33 @@ target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
  
  int target_handle_ping(struct ptlrpc_request *req)
  {
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY &&
+            req->rq_export->exp_in_recovery) {
+                spin_lock(&req->rq_export->exp_lock);
+                req->rq_export->exp_in_recovery = 0;
+                spin_unlock(&req->rq_export->exp_lock);
+        }
          obd_ping(req->rq_export);
          return lustre_pack_reply(req, 1, NULL, NULL);
  }
  
  void target_committed_to_req(struct ptlrpc_request *req)
  {
-        struct obd_device *obd = req->rq_export->exp_obd;
-
-        if (!obd->obd_no_transno && req->rq_repmsg != NULL)
+        struct obd_export *exp = req->rq_export;
+        if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL) {
                  lustre_msg_set_last_committed(req->rq_repmsg,
-                                              obd->obd_last_committed);
-        else
+                                              exp->exp_last_committed);
+        } else {
                  DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/"
-                          "%d)", obd->obd_no_transno, req->rq_repmsg == NULL);
-
+                          "%d)", exp->exp_obd->obd_no_transno,
+                          req->rq_repmsg == NULL);
+        }
          CDEBUG(D_INFO, "last_committed x"LPU64", this req x"LPU64"\n",
-               obd->obd_last_committed, req->rq_xid);
+               exp->exp_obd->obd_last_committed, req->rq_xid);
  }
  
  EXPORT_SYMBOL(target_committed_to_req);
  
-#ifdef HAVE_QUOTA_SUPPORT
  int target_handle_qc_callback(struct ptlrpc_request *req)
  {
          struct obd_quotactl *oqctl;
@@ -1727,7 +1947,7 @@ int target_handle_qc_callback(struct ptlrpc_request *req)
          oqctl = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*oqctl),
                                     lustre_swab_obd_quotactl);
          if (oqctl == NULL) {
-                CERROR("Can't unpack obd_quotactl\n");
+                CERROR("Can't unpack obd_quatactl\n");
                  RETURN(-EPROTO);
          }
  
@@ -1736,6 +1956,7 @@ int target_handle_qc_callback(struct ptlrpc_request *req)
          return 0;
  }
  
+#ifdef HAVE_QUOTA_SUPPORT
  int target_handle_dqacq_callback(struct ptlrpc_request *req)
  {
  #ifdef __KERNEL__
@@ -1766,15 +1987,31 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
                  RETURN(-ENOMEM);
          rc = quota_get_qdata(req, qdata, QUOTA_REQUEST, QUOTA_EXPORT);
          if (rc < 0) {
-                CDEBUG(D_ERROR, "Can't unpack qunit_data\n");
-                GOTO(out, rc = -EPROTO);
+                CDEBUG(D_ERROR, "Can't unpack qunit_data(rc: %d)\n", rc);
+                GOTO(out, rc);
          }
  
          /* we use the observer */
-        LASSERT(obd->obd_observer && obd->obd_observer->obd_observer);
+        if (!obd->obd_observer || !obd->obd_observer->obd_observer) {
+                CERROR("Can't find the observer, it is recovering\n");
+                req->rq_status = -EAGAIN;
+                GOTO(send_reply, rc = -EAGAIN);
+        }
+
          master_obd = obd->obd_observer->obd_observer;
          qctxt = &master_obd->u.obt.obt_qctxt;
  
+        if (!qctxt->lqc_setup) {
+                /* quota_type has not been processed yet, return EAGAIN
+                 * until we know whether or not quotas are supposed to
+                 * be enabled */
+                CDEBUG(D_QUOTA, "quota_type not processed yet, return "
+                                "-EAGAIN\n");
+                req->rq_status = -EAGAIN;
+                rc = ptlrpc_reply(req);
+                GOTO(out, rc);
+        }
+
          LASSERT(qctxt->lqc_handler);
          rc = qctxt->lqc_handler(master_obd, qdata,
                                  lustre_msg_get_opc(req->rq_reqmsg));
@@ -1787,13 +2024,13 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
           * adjust the same form to different forms slaves needed */
          rc = quota_copy_qdata(req, qdata, QUOTA_REPLY, QUOTA_EXPORT);
          if (rc < 0) {
-                CDEBUG(D_ERROR, "Can't pack qunit_data\n");
-                GOTO(out, rc = -EPROTO);
+                CDEBUG(D_ERROR, "Can't pack qunit_data(rc: %d)\n", rc);
+                GOTO(out, rc);
          }
  
          /* Block the quota req. b=14840 */
          OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_BLOCK_QUOTA_REQ, obd_timeout);
-
+ send_reply:
          rc = ptlrpc_reply(req);
  out:
          OBD_FREE(qdata, sizeof(struct qunit_data));
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c

index 2c25f41..48bb4d2 100644 (file)
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -1,27 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lock.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
@@ -155,8 +170,8 @@ void ldlm_lock_put(struct ldlm_lock *lock)
                          OBD_FREE(lock->l_lvb_data, lock->l_lvb_len);
  
                  ldlm_interval_free(ldlm_interval_detach(lock));
-                OBD_FREE_RCU_CB(lock, sizeof(*lock), &lock->l_handle, 
-                               ldlm_lock_free);
+                OBD_FREE_RCU_CB(lock, sizeof(*lock), &lock->l_handle,
+                                ldlm_lock_free);
          }
  
          EXIT;
@@ -193,6 +208,7 @@ void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock)
          struct ldlm_namespace *ns = lock->l_resource->lr_namespace;
          lock->l_last_used = cfs_time_current();
          LASSERT(list_empty(&lock->l_lru));
+        LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
          list_add_tail(&lock->l_lru, &ns->ns_unused_list);
          LASSERT(ns->ns_nr_unused >= 0);
          ns->ns_nr_unused++;
@@ -249,11 +265,9 @@ int ldlm_lock_destroy_internal(struct ldlm_lock *lock)
          }
          lock->l_destroyed = 1;
  
-        if (lock->l_export)
-                spin_lock(&lock->l_export->exp_ldlm_data.led_lock);
-        list_del_init(&lock->l_export_chain);
-        if (lock->l_export)
-                spin_unlock(&lock->l_export->exp_ldlm_data.led_lock);
+        if (lock->l_export && lock->l_export->exp_lock_hash)
+                lustre_hash_del(lock->l_export->exp_lock_hash,
+                                &lock->l_remote_handle, &lock->l_exp_hash);
  
          ldlm_lock_remove_from_lru(lock);
          class_handle_unhash(&lock->l_handle);
@@ -327,7 +341,6 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
          atomic_set(&lock->l_refc, 2);
          CFS_INIT_LIST_HEAD(&lock->l_res_link);
          CFS_INIT_LIST_HEAD(&lock->l_lru);
-        CFS_INIT_LIST_HEAD(&lock->l_export_chain);
          CFS_INIT_LIST_HEAD(&lock->l_pending_chain);
          CFS_INIT_LIST_HEAD(&lock->l_bl_ast);
          CFS_INIT_LIST_HEAD(&lock->l_cp_ast);
@@ -335,6 +348,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
          lock->l_blocking_lock = NULL;
          CFS_INIT_LIST_HEAD(&lock->l_sl_mode);
          CFS_INIT_LIST_HEAD(&lock->l_sl_policy);
+        CFS_INIT_HLIST_NODE(&lock->l_exp_hash);
  
          atomic_inc(&resource->lr_namespace->ns_locks);
          CFS_INIT_LIST_HEAD(&lock->l_handle.h_link);
@@ -343,6 +357,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
          CFS_INIT_LIST_HEAD(&lock->l_extents_list);
          spin_lock_init(&lock->l_extents_list_lock);
          CFS_INIT_LIST_HEAD(&lock->l_cache_locks_list);
+        lock->l_callback_timeout = 0;
  
          RETURN(lock);
  }
@@ -538,7 +553,7 @@ void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
          check_res_locked(lock->l_resource);
          if (new)
                  ldlm_add_bl_work_item(lock, new, work_list);
-        else 
+        else
                  ldlm_add_cp_work_item(lock, work_list);
          EXIT;
  }
@@ -574,7 +589,7 @@ void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode)
  
  /* only called in ldlm_flock_destroy and for local locks.
   * for LDLM_FLOCK type locks, l_blocking_ast is null, and
- * ldlm_lock_remove_from_lru() does nothing, it is safe 
+ * ldlm_lock_remove_from_lru() does nothing, it is safe
   * for ldlm_flock_destroy usage by dropping some code */
  void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
  {
@@ -623,6 +638,10 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
                  LDLM_LOCK_GET(lock); /* dropped by bl thread */
                  ldlm_lock_remove_from_lru(lock);
                  unlock_res_and_lock(lock);
+
+                if (lock->l_flags & LDLM_FL_FAIL_LOC)
+                        OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
                  if ((lock->l_flags & LDLM_FL_ATOMIC_CB) ||
                      ldlm_bl_to_thread_lock(ns, NULL, lock) != 0)
                          ldlm_handle_bl_callback(ns, NULL, lock);
@@ -634,10 +653,10 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
                   * reference, put it on the LRU. */
                  ldlm_lock_add_to_lru(lock);
                  unlock_res_and_lock(lock);
-                /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE 
-                 * are not supported by the server, otherwise, it is done on 
+                /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE
+                 * are not supported by the server, otherwise, it is done on
                   * enqueue. */
-                if (!exp_connect_cancelset(lock->l_conn_export) && 
+                if (!exp_connect_cancelset(lock->l_conn_export) &&
                      !ns_connect_lru_resize(ns))
                          ldlm_cancel_lru(ns, 0, LDLM_ASYNC, 0);
          } else {
@@ -746,7 +765,7 @@ static void search_granted_lock(struct list_head *queue,
  
                                  /* jump to next policy group within the mode group */
                                  tmp = policy_end->l_res_link.next;
-                                lock = list_entry(tmp, struct ldlm_lock, 
+                                lock = list_entry(tmp, struct ldlm_lock,
                                                    l_res_link);
                          }  /* loop over policy groups within the mode group */
  
@@ -772,7 +791,7 @@ static void search_granted_lock(struct list_head *queue,
          return;
  }
  
-static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, 
+static void ldlm_granted_list_add_lock(struct ldlm_lock *lock,
                                         struct sl_insert_point *prev)
  {
          struct ldlm_resource *res = lock->l_resource;
@@ -780,7 +799,7 @@ static void ldlm_granted_list_add_lock(struct ldlm_lock *lock,
  
          check_res_locked(res);
  
-        ldlm_resource_dump(D_OTHER, res);
+        ldlm_resource_dump(D_INFO, res);
          CDEBUG(D_OTHER, "About to add this lock:\n");
          ldlm_lock_dump(D_OTHER, lock, 0);
  
@@ -931,15 +950,34 @@ int ldlm_lock_fast_match(struct ldlm_lock *lock, int rw,
                           void **cookie)
  {
          LASSERT(rw == OBD_BRW_READ || rw == OBD_BRW_WRITE);
-        /* should LCK_GROUP be handled in a special way? */
-        if (lock && (rw == OBD_BRW_READ ||
-                     (lock->l_granted_mode & (LCK_PW|LCK_GROUP))) &&
-            (lock->l_policy_data.l_extent.start <= start) &&
-            (lock->l_policy_data.l_extent.end >= end)) {
-                ldlm_lock_addref_internal(lock, rw == OBD_BRW_WRITE ? LCK_PW : LCK_PR);
-                *cookie = (void *)lock;
-                return 1; /* avoid using rc for stack relief */
-        }
+
+        if (!lock)
+                return 0;
+
+        lock_res_and_lock(lock);
+        /* check if granted mode is compatible */
+        if (rw == OBD_BRW_WRITE &&
+            !(lock->l_granted_mode & (LCK_PW|LCK_GROUP)))
+                goto no_match;
+
+        /* does the lock cover the region we would like to access? */
+        if ((lock->l_policy_data.l_extent.start > start) ||
+            (lock->l_policy_data.l_extent.end < end))
+                goto no_match;
+
+        /* if we received a blocking callback and the lock is no longer
+         * referenced, don't use it */
+        if ((lock->l_flags & LDLM_FL_CBPENDING) &&
+            !lock->l_writers && !lock->l_readers)
+                goto no_match;
+
+        ldlm_lock_addref_internal_nolock(lock, rw == OBD_BRW_WRITE ? LCK_PW : LCK_PR);
+        unlock_res_and_lock(lock);
+        *cookie = (void *)lock;
+        return 1; /* avoid using rc for stack relief */
+
+no_match:
+        unlock_res_and_lock(lock);
          return 0;
  }
  
@@ -1178,6 +1216,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
  
          ldlm_resource_unlink_lock(lock);
          if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) {
+                LASSERT(!local && (*flags & LDLM_FL_REPLAY));
                  if (node == NULL) {
                          ldlm_lock_destroy_nolock(lock);
                          GOTO(out, rc = -ENOMEM);
@@ -1267,7 +1306,7 @@ int ldlm_reprocess_queue(struct ldlm_resource *res, struct list_head *queue,
  }
  
  /* Helper function for pair ldlm_run_{bl,cp}_ast_work().
- * 
+ *
   * Send an existing rpc set specified by @arg->set and then
   * destroy it. Create new one if @do_create flag is set. */
  static void
@@ -1316,7 +1355,7 @@ int ldlm_run_bl_ast_work(struct list_head *rpc_list)
  
                  LDLM_LOCK_PUT(lock->l_blocking_lock);
                  lock->l_blocking_lock = NULL;
-                rc = lock->l_blocking_ast(lock, &d, (void *)&arg, 
+                rc = lock->l_blocking_ast(lock, &d, (void *)&arg,
                                            LDLM_CB_BLOCKING);
                  LDLM_LOCK_PUT(lock);
                  ast_count++;
@@ -1335,7 +1374,7 @@ int ldlm_run_bl_ast_work(struct list_head *rpc_list)
          else
                  /* In case when number of ASTs is multiply of
                   * PARALLEL_AST_LIMIT or @rpc_list was initially empty,
-                 * @arg.set must be destroyed here, otherwise we get 
+                 * @arg.set must be destroyed here, otherwise we get
                   * write memory leaking. */
                  ptlrpc_set_destroy(arg.set);
  
@@ -1364,7 +1403,7 @@ int ldlm_run_cp_ast_work(struct list_head *rpc_list)
           * will never call the local blocking_ast until we drop our
           * reader/writer reference, which we won't do until we get the
           * reply and finish enqueueing. */
-        
+
          ast_count = 0;
          list_for_each_safe(tmp, pos, rpc_list) {
                  struct ldlm_lock *lock =
@@ -1401,7 +1440,7 @@ int ldlm_run_cp_ast_work(struct list_head *rpc_list)
          else
                  /* In case when number of ASTs is multiply of
                   * PARALLEL_AST_LIMIT or @rpc_list was initially empty,
-                 * @arg.set must be destroyed here, otherwise we get 
+                 * @arg.set must be destroyed here, otherwise we get
                   * write memory leaking. */
                  ptlrpc_set_destroy(arg.set);
  
@@ -1525,7 +1564,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock)
  
          /* Yes, second time, just in case it was added again while we were
             running with no res lock in ldlm_cancel_callback */
-        ldlm_del_waiting_lock(lock); 
+        ldlm_del_waiting_lock(lock);
          ldlm_resource_unlink_lock(lock);
          ldlm_lock_destroy_nolock(lock);
  
@@ -1553,30 +1592,29 @@ int ldlm_lock_set_data(struct lustre_handle *lockh, void *data)
          RETURN(0);
  }
  
-void ldlm_cancel_locks_for_export(struct obd_export *exp)
+void ldlm_cancel_locks_for_export_cb(void *obj, void *data)
  {
-        struct ldlm_lock *lock;
-        struct ldlm_resource *res;
+        struct obd_export     *exp = data;
+        struct ldlm_lock      *lock = obj;
+        struct ldlm_resource  *res;
  
-        spin_lock(&exp->exp_ldlm_data.led_lock);
-        while(!list_empty(&exp->exp_ldlm_data.led_held_locks)) {
-                lock = list_entry(exp->exp_ldlm_data.led_held_locks.next,
-                                  struct ldlm_lock, l_export_chain);
-                res = ldlm_resource_getref(lock->l_resource);
-                LDLM_LOCK_GET(lock);
-                spin_unlock(&exp->exp_ldlm_data.led_lock);
+        res = ldlm_resource_getref(lock->l_resource);
+        LDLM_LOCK_GET(lock);
  
-                LDLM_DEBUG(lock, "export %p", exp);
-                ldlm_res_lvbo_update(res, NULL, 0, 1);
+        LDLM_DEBUG(lock, "export %p", exp);
+        ldlm_res_lvbo_update(res, NULL, 0, 1);
  
-                ldlm_lock_cancel(lock);
-                ldlm_reprocess_all(res);
+        ldlm_lock_cancel(lock);
+        ldlm_reprocess_all(res);
  
-                ldlm_resource_putref(res);
-                LDLM_LOCK_PUT(lock);
-                spin_lock(&exp->exp_ldlm_data.led_lock);
-        }
-        spin_unlock(&exp->exp_ldlm_data.led_lock);
+        ldlm_resource_putref(res);
+        LDLM_LOCK_PUT(lock);
+}
+
+void ldlm_cancel_locks_for_export(struct obd_export *exp)
+{
+        lustre_hash_for_each_empty(exp->exp_lock_hash,
+                                   ldlm_cancel_locks_for_export_cb, exp);
  }
  
  struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
@@ -1614,8 +1652,8 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
          old_mode = lock->l_req_mode;
          lock->l_req_mode = new_mode;
          if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) {
-                /* remember the lock position where the lock might be 
-                 * added back to the granted list later and also 
+                /* remember the lock position where the lock might be
+                 * added back to the granted list later and also
                   * remember the join mode for skiplist fixing. */
                  prev.res_link = lock->l_res_link.prev;
                  prev.mode_link = lock->l_sl_mode.prev;
@@ -1624,7 +1662,7 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
          } else {
                  ldlm_resource_unlink_lock(lock);
                  if (res->lr_type == LDLM_EXTENT) {
-                        /* FIXME: ugly code, I have to attach the lock to a 
+                        /* FIXME: ugly code, I have to attach the lock to a
                           * interval node again since perhaps it will be granted
                           * soon */
                          CFS_INIT_LIST_HEAD(&node->li_group);
@@ -1710,7 +1748,7 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock, int pos)
                 lock->l_resource->lr_name.name[0],
                 lock->l_resource->lr_name.name[1]);
          CDEBUG(level, "  Req mode: %s, grant mode: %s, rc: %u, read: %d, "
-               "write: %d flags: %#x\n", ldlm_lockname[lock->l_req_mode],
+               "write: %d flags: "LPX64"\n", ldlm_lockname[lock->l_req_mode],
                 ldlm_lockname[lock->l_granted_mode],
                 atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers,
                 lock->l_flags);
@@ -1747,19 +1785,19 @@ void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
  }
  
  void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
-                     struct libcfs_debug_msg_data *data, const char *fmt,
+                      struct libcfs_debug_msg_data *data, const char *fmt,
                        ...)
  {
          va_list args;
          cfs_debug_limit_state_t *cdls = data->msg_cdls;
  
-       va_start(args, fmt);
+        va_start(args, fmt);
          if (lock->l_resource == NULL) {
                  libcfs_debug_vmsg2(cdls, data->msg_subsys, level, data->msg_file,
                                     data->msg_fn, data->msg_line, fmt, args,
                                     " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
-                                   "res: \?\? rrc=\?\? type: \?\?\? flags: %x remote: "
-                                   LPX64" expref: %d pid: %u\n", lock,
+                                   "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" remote: "
+                                   LPX64" expref: %d pid: %u timeout: %lu\n", lock,
                                     lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                                     lock->l_readers, lock->l_writers,
                                     ldlm_lockname[lock->l_granted_mode],
@@ -1767,8 +1805,8 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                     lock->l_flags, lock->l_remote_handle.cookie,
                                     lock->l_export ?
                                          atomic_read(&lock->l_export->exp_refcount) : -99,
-                                   lock->l_pid);
-               va_end(args);
+                                   lock->l_pid, lock->l_callback_timeout);
+                va_end(args);
                  return;
          }
  
@@ -1778,8 +1816,8 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                     data->msg_fn, data->msg_line, fmt, args,
                                     " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
                                     "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64
-                                   "] (req "LPU64"->"LPU64") flags: %x remote: "LPX64
-                                    " expref: %d pid: %u\n",
+                                   "] (req "LPU64"->"LPU64") flags: "LPX64" remote: "LPX64
+                                    " expref: %d pid: %u timeout %lu\n",
                                      lock->l_resource->lr_namespace->ns_name, lock,
                                      lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                                      lock->l_readers, lock->l_writers,
@@ -1795,15 +1833,15 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                      lock->l_flags, lock->l_remote_handle.cookie,
                                      lock->l_export ?
                                          atomic_read(&lock->l_export->exp_refcount) : -99,
-                                    lock->l_pid);
+                                    lock->l_pid, lock->l_callback_timeout);
                  break;
          case LDLM_FLOCK:
                  libcfs_debug_vmsg2(cdls, data->msg_subsys, level, data->msg_file,
                                     data->msg_fn, data->msg_line, fmt, args,
                                     " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
                                     "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d "
-                                   "["LPU64"->"LPU64"] flags: %x remote: "LPX64
-                                   " expref: %d pid: %u\n",
+                                   "["LPU64"->"LPU64"] flags: "LPX64" remote: "LPX64
+                                   " expref: %d pid: %u timeout: %lu\n",
                                     lock->l_resource->lr_namespace->ns_name, lock,
                                     lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                                     lock->l_readers, lock->l_writers,
@@ -1819,15 +1857,15 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                     lock->l_flags, lock->l_remote_handle.cookie,
                                     lock->l_export ?
                                          atomic_read(&lock->l_export->exp_refcount) : -99,
-                                   lock->l_pid);
+                                   lock->l_pid, lock->l_callback_timeout);
                  break;
          case LDLM_IBITS:
                  libcfs_debug_vmsg2(cdls, data->msg_subsys, level, data->msg_file,
                                     data->msg_fn, data->msg_line, fmt, args,
                                     " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
                                     "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s "
-                                   "flags: %x remote: "LPX64" expref: %d "
-                                   "pid %u\n",
+                                   "flags: "LPX64" remote: "LPX64" expref: %d "
+                                   "pid: %u timeout: %lu\n",
                                     lock->l_resource->lr_namespace->ns_name,
                                     lock, lock->l_handle.h_cookie,
                                     atomic_read (&lock->l_refc),
@@ -1842,14 +1880,14 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                     lock->l_flags, lock->l_remote_handle.cookie,
                                     lock->l_export ?
                                          atomic_read(&lock->l_export->exp_refcount) : -99,
-                                   lock->l_pid);
+                                   lock->l_pid, lock->l_callback_timeout);
                  break;
          default:
                  libcfs_debug_vmsg2(cdls, data->msg_subsys, level, data->msg_file,
                                     data->msg_fn, data->msg_line, fmt, args,
                                     " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
-                                   "res: "LPU64"/"LPU64" rrc: %d type: %s flags: %x "
-                                   "remote: "LPX64" expref: %d pid: %u\n",
+                                   "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" "
+                                   "remote: "LPX64" expref: %d pid: %u timeout %lu\n",
                                     lock->l_resource->lr_namespace->ns_name,
                                     lock, lock->l_handle.h_cookie,
                                     atomic_read (&lock->l_refc),
@@ -1863,7 +1901,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                     lock->l_flags, lock->l_remote_handle.cookie,
                                     lock->l_export ?
                                           atomic_read(&lock->l_export->exp_refcount) : -99,
-                                   lock->l_pid);
+                                   lock->l_pid, lock->l_callback_timeout);
                  break;
          }
          va_end(args);
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c

index 1caf259..dc0dea9 100644 (file)
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -1,27 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002-2004 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lockd.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -125,8 +140,8 @@ struct ldlm_bl_work_item {
  static inline int have_expired_locks(void)
  {
          int need_to_run;
-
          ENTRY;
+
          spin_lock_bh(&waiting_locks_spinlock);
          need_to_run = !list_empty(&expired_lock_thread.elt_expired_locks);
          spin_unlock_bh(&waiting_locks_spinlock);
@@ -187,11 +202,19 @@ static int expired_lock_main(void *arg)
                                         lock->l_export);
                                  lock->l_export = NULL;
                                  LDLM_ERROR(lock, "free export");
+                                /* release extra ref grabbed by
+                                 * ldlm_add_waiting_lock() or
+                                 * ldlm_failed_ast() */
+                                LDLM_LOCK_PUT(lock);
                                  continue;
                          }
                          export = class_export_get(lock->l_export);
                          spin_unlock_bh(&waiting_locks_spinlock);
  
+                        /* release extra ref grabbed by ldlm_add_waiting_lock()
+                         * or ldlm_failed_ast() */
+                        LDLM_LOCK_PUT(lock);
+
                          do_dump++;
                          class_fail_export(export);
                          class_export_put(export);
@@ -213,6 +236,31 @@ static int expired_lock_main(void *arg)
          RETURN(0);
  }
  
+/**
+ * Check if there is a request in the export request list
+ * which prevents the lock canceling.
+ */
+static int ldlm_lock_busy(struct ldlm_lock *lock)
+{
+        struct ptlrpc_request *req;
+        int match = 0;
+        ENTRY;
+
+        if (lock->l_export == NULL)
+                return 0;
+
+        spin_lock(&lock->l_export->exp_lock);
+        list_for_each_entry(req, &lock->l_export->exp_queued_rpc, rq_exp_list) {
+                if (req->rq_ops->hpreq_lock_match) {
+                        match = req->rq_ops->hpreq_lock_match(req, lock);
+                        if (match)
+                                break;
+                }
+        }
+        spin_unlock(&lock->l_export->exp_lock);
+        RETURN(match);
+}
+
  /* This is called from within a timer interrupt and cannot schedule */
  static void waiting_locks_callback(unsigned long unused)
  {
@@ -222,11 +270,34 @@ static void waiting_locks_callback(unsigned long unused)
          while (!list_empty(&waiting_locks_list)) {
                  lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
                                    l_pending_chain);
-
                  if (cfs_time_after(lock->l_callback_timeout, cfs_time_current())
                      || (lock->l_req_mode == LCK_GROUP))
                          break;
  
+                /* Check if we need to prolong timeout */
+                if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) &&
+                    ldlm_lock_busy(lock)) {
+                        int cont = 1;
+
+                        if (lock->l_pending_chain.next == &waiting_locks_list)
+                                cont = 0;
+
+                        LDLM_LOCK_GET(lock);
+                        spin_unlock_bh(&waiting_locks_spinlock);
+                        LDLM_DEBUG(lock, "prolong the busy lock");
+                        ldlm_refresh_waiting_lock(lock, 
+                                                  ldlm_get_enq_timeout(lock));
+                        spin_lock_bh(&waiting_locks_spinlock);
+
+                        if (!cont) {
+                                LDLM_LOCK_PUT(lock);
+                                break;
+                        }
+
+                        LDLM_LOCK_PUT(lock);
+                        continue;
+                }
+                lock->l_resource->lr_namespace->ns_timeouts++;
                  LDLM_ERROR(lock, "lock callback timer expired after %lds: "
                             "evicting client at %s ",
                             cfs_time_current_sec()- lock->l_enqueued_time.tv_sec,
@@ -253,6 +324,9 @@ static void waiting_locks_callback(unsigned long unused)
                  }
                  last = lock;
  
+                /* no needs to take an extra ref on the lock since it was in
+                 * the waiting_locks_list and ldlm_add_waiting_lock()
+                 * already grabbed a ref */
                  list_del(&lock->l_pending_chain);
                  list_add(&lock->l_pending_chain,
                           &expired_lock_thread.elt_expired_locks);
@@ -284,24 +358,30 @@ static void waiting_locks_callback(unsigned long unused)
   * lock.  We add it to the pending-callback chain, and schedule the lock-timeout
   * timer to fire appropriately.  (We round up to the next second, to avoid
   * floods of timer firings during periods of high lock contention and traffic).
+ * As done by ldlm_add_waiting_lock(), the caller must grab a lock reference
+ * if it has been added to the waiting list (1 is returned).
   *
   * Called with the namespace lock held.
   */
-static int __ldlm_add_waiting_lock(struct ldlm_lock *lock)
+static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds)
  {
-        int timeout;
+        cfs_time_t timeout;
          cfs_time_t timeout_rounded;
  
          if (!list_empty(&lock->l_pending_chain))
                  return 0;
  
-        timeout = ldlm_get_enq_timeout(lock);
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) ||
+            OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
+                seconds = 2;
  
-        lock->l_callback_timeout = cfs_time_shift(timeout);
+        timeout = cfs_time_shift(seconds);
+        if (likely(cfs_time_after(timeout, lock->l_callback_timeout)))
+                lock->l_callback_timeout = timeout;
  
          timeout_rounded = round_timeout(lock->l_callback_timeout);
  
-        if (cfs_time_before(timeout_rounded, 
+        if (cfs_time_before(timeout_rounded,
                              cfs_timer_deadline(&waiting_locks_timer)) ||
              !cfs_timer_is_armed(&waiting_locks_timer)) {
                  cfs_timer_arm(&waiting_locks_timer, timeout_rounded);
@@ -330,7 +410,11 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
                  return 0;
          }
  
-        ret = __ldlm_add_waiting_lock(lock);
+        ret = __ldlm_add_waiting_lock(lock, ldlm_get_enq_timeout(lock));
+        if (ret)
+                /* grab ref on the lock if it has been added to the
+                 * waiting list */
+                LDLM_LOCK_GET(lock);
          spin_unlock_bh(&waiting_locks_spinlock);
  
          LDLM_DEBUG(lock, "%sadding to wait list",
@@ -342,10 +426,12 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
   * Remove a lock from the pending list, likely because it had its cancellation
   * callback arrive without incident.  This adjusts the lock-timeout timer if
   * needed.  Returns 0 if the lock wasn't pending after all, 1 if it was.
+ * As done by ldlm_del_waiting_lock(), the caller must release the lock
+ * reference when the lock is removed from any list (1 is returned).
   *
   * Called with namespace lock held.
   */
-int __ldlm_del_waiting_lock(struct ldlm_lock *lock)
+static int __ldlm_del_waiting_lock(struct ldlm_lock *lock)
  {
          struct list_head *list_next;
  
@@ -384,6 +470,10 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
          spin_lock_bh(&waiting_locks_spinlock);
          ret = __ldlm_del_waiting_lock(lock);
          spin_unlock_bh(&waiting_locks_spinlock);
+        if (ret)
+                /* release lock ref if it has indeed been removed
+                 * from a list */
+                LDLM_LOCK_PUT(lock);
  
          LDLM_DEBUG(lock, "%s", ret == 0 ? "wasn't waiting" : "removed");
          return ret;
@@ -394,7 +484,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
   *
   * Called with namespace lock held.
   */
-int ldlm_refresh_waiting_lock(struct ldlm_lock *lock)
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
  {
          if (lock->l_export == NULL) {
                  /* We don't have a "waiting locks list" on clients. */
@@ -410,14 +500,15 @@ int ldlm_refresh_waiting_lock(struct ldlm_lock *lock)
                  return 0;
          }
  
+        /* we remove/add the lock to the waiting list, so no needs to
+         * release/take a lock reference */
          __ldlm_del_waiting_lock(lock);
-        __ldlm_add_waiting_lock(lock);
+        __ldlm_add_waiting_lock(lock, timeout);
          spin_unlock_bh(&waiting_locks_spinlock);
  
          LDLM_DEBUG(lock, "refreshed");
          return 1;
  }
-
  #else /* !__KERNEL__ */
  
  static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
@@ -431,7 +522,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
          RETURN(0);
  }
  
-int ldlm_refresh_waiting_lock(struct ldlm_lock *lock)
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
  {
          RETURN(0);
  }
@@ -450,7 +541,18 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
  
          if (obd_dump_on_timeout)
                  libcfs_debug_dumplog();
+#ifdef __KERNEL__
+        spin_lock_bh(&waiting_locks_spinlock);
+        if (__ldlm_del_waiting_lock(lock) == 0)
+                /* the lock was not in any list, grab an extra ref before adding
+                 * the lock to the expired list */
+                LDLM_LOCK_GET(lock);
+        list_add(&lock->l_pending_chain, &expired_lock_thread.elt_expired_locks);
+        cfs_waitq_signal(&expired_lock_thread.elt_waitq);
+        spin_unlock_bh(&waiting_locks_spinlock);
+#else
          class_fail_export(lock->l_export);
+#endif
  }
  
  static int ldlm_handle_ast_error(struct ldlm_lock *lock,
@@ -553,6 +655,30 @@ static inline int ldlm_bl_and_cp_ast_fini(struct ptlrpc_request *req,
          RETURN(rc);
  }
  
+/**
+ * Check if there are requests in the export request list which prevent
+ * the lock canceling and make these requests high priority ones.
+ */
+static void ldlm_lock_reorder_req(struct ldlm_lock *lock)
+{
+        struct ptlrpc_request *req;
+        ENTRY;
+
+        if (lock->l_export == NULL) {
+                LDLM_DEBUG(lock, "client lock: no-op");
+                RETURN_EXIT;
+        }
+
+        spin_lock(&lock->l_export->exp_lock);
+        list_for_each_entry(req, &lock->l_export->exp_queued_rpc, rq_exp_list) {
+                if (!req->rq_hp && req->rq_ops->hpreq_lock_match &&
+                    req->rq_ops->hpreq_lock_match(req, lock))
+                        ptlrpc_hpreq_reorder(req);
+        }
+        spin_unlock(&lock->l_export->exp_lock);
+        EXIT;
+}
+
  /*
   * ->l_blocking_ast() method for server-side locks. This is invoked when newly
   * enqueued server lock conflicts with given one.
@@ -567,7 +693,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
          struct ldlm_cb_set_arg *arg = data;
          struct ldlm_request *body;
          struct ptlrpc_request *req;
-        int size[] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                         [DLM_LOCKREQ_OFF]     = sizeof(*body) };
          int instant_cancel = 0, rc;
          ENTRY;
@@ -580,6 +706,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
          LASSERT(lock);
          LASSERT(data != NULL);
  
+        ldlm_lock_reorder_req(lock);
+
          req = ptlrpc_prep_req(lock->l_export->exp_imp_reverse,
                                LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK, 2, size,
                                NULL);
@@ -642,9 +770,11 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
          if (AT_OFF)
                  req->rq_timeout = ldlm_get_rq_timeout();
  
-        if (lock->l_export && lock->l_export->exp_ldlm_stats)
-                lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
+        if (lock->l_export && lock->l_export->exp_nid_stats &&
+            lock->l_export->exp_nid_stats->nid_ldlm_stats) {
+                lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats,
                                       LDLM_BL_CALLBACK - LDLM_FIRST_OPC);
+        }
  
          rc = ldlm_bl_and_cp_ast_fini(req, arg, lock, instant_cancel);
  
@@ -658,7 +788,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
          struct ptlrpc_request *req;
          struct timeval granted_time;
          long total_enqueue_wait;
-        int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(*body) };
          int rc, buffers = 2, instant_cancel = 0;
          ENTRY;
@@ -747,9 +877,11 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
          }
          unlock_res_and_lock(lock);
  
-        if (lock->l_export && lock->l_export->exp_ldlm_stats)
-                lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
+        if (lock->l_export && lock->l_export->exp_nid_stats &&
+            lock->l_export->exp_nid_stats->nid_ldlm_stats) {
+                lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats,
                                       LDLM_CP_CALLBACK - LDLM_FIRST_OPC);
+        }
  
          rc = ldlm_bl_and_cp_ast_fini(req, arg, lock, instant_cancel);
  
@@ -761,7 +893,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data)
          struct ldlm_resource *res = lock->l_resource;
          struct ldlm_request *body;
          struct ptlrpc_request *req;
-        int size[] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                         [DLM_LOCKREQ_OFF]     = sizeof(*body) };
          int rc = 0;
          ENTRY;
@@ -789,9 +921,11 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data)
          if (AT_OFF)
                  req->rq_timeout = ldlm_get_rq_timeout();
  
-        if (lock->l_export && lock->l_export->exp_ldlm_stats)
-                lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
+        if (lock->l_export && lock->l_export->exp_nid_stats &&
+            lock->l_export->exp_nid_stats->nid_ldlm_stats) {
+                lprocfs_counter_incr(lock->l_export->exp_nid_stats->nid_ldlm_stats,
                                       LDLM_GL_CALLBACK - LDLM_FIRST_OPC);
+        }
  
          rc = ptlrpc_queue_wait(req);
          if (rc == -ELDLM_NO_LOCK_DATA)
@@ -808,25 +942,6 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data)
          RETURN(rc);
  }
  
-static struct ldlm_lock *
-find_existing_lock(struct obd_export *exp, struct lustre_handle *remote_hdl)
-{
-        struct list_head *iter;
-
-        spin_lock(&exp->exp_ldlm_data.led_lock);
-        list_for_each(iter, &exp->exp_ldlm_data.led_held_locks) {
-                struct ldlm_lock *lock;
-                lock = list_entry(iter, struct ldlm_lock, l_export_chain);
-                if (lock->l_remote_handle.cookie == remote_hdl->cookie) {
-                        LDLM_LOCK_GET(lock);
-                        spin_unlock(&exp->exp_ldlm_data.led_lock);
-                        return lock;
-                }
-        }
-        spin_unlock(&exp->exp_ldlm_data.led_lock);
-        return NULL;
-}
-
  static void ldlm_svc_get_eopc(struct ldlm_request *dlm_req,
                         struct lprocfs_stats *srv_stats)
  {
@@ -873,7 +988,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
          struct obd_device *obddev = req->rq_export->exp_obd;
          struct ldlm_reply *dlm_rep;
          struct ldlm_request *dlm_req;
-        int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREPLY_OFF]   = sizeof(*dlm_rep) };
          int rc = 0;
          __u32 flags;
@@ -900,9 +1015,11 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
                  ldlm_svc_get_eopc(dlm_req,
                                    req->rq_rqbd->rqbd_service->srv_stats);
  
-        if (req->rq_export->exp_ldlm_stats)
-                lprocfs_counter_incr(req->rq_export->exp_ldlm_stats,
+        if (req->rq_export && req->rq_export->exp_nid_stats &&
+            req->rq_export->exp_nid_stats->nid_ldlm_stats) {
+                lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats,
                                       LDLM_ENQUEUE - LDLM_FIRST_OPC);
+        }
  
          if (dlm_req->lock_desc.l_resource.lr_type < LDLM_MIN_TYPE ||
              dlm_req->lock_desc.l_resource.lr_type >= LDLM_MAX_TYPE) {
@@ -949,8 +1066,9 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
  #endif
  
          if (flags & LDLM_FL_REPLAY) {
-                lock = find_existing_lock(req->rq_export,
-                                          &dlm_req->lock_handle[0]);
+                /* Find an existing lock in the per-export lock hash */
+                lock = lustre_hash_lookup(req->rq_export->exp_lock_hash,
+                                          (void *)&dlm_req->lock_handle[0]);
                  if (lock != NULL) {
                          DEBUG_REQ(D_DLMTRACE, req, "found existing lock cookie "
                                    LPX64, lock->l_handle.h_cookie);
@@ -980,10 +1098,10 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
                  GOTO(out, rc = -ENOTCONN);
          }
          lock->l_export = class_export_get(req->rq_export);
-        spin_lock(&lock->l_export->exp_ldlm_data.led_lock);
-        list_add(&lock->l_export_chain,
-                 &lock->l_export->exp_ldlm_data.led_held_locks);
-        spin_unlock(&lock->l_export->exp_ldlm_data.led_lock);
+
+        if (lock->l_export->exp_lock_hash)
+                lustre_hash_add(lock->l_export->exp_lock_hash,
+                                &lock->l_remote_handle, &lock->l_exp_hash);
  
  existing_lock:
  
@@ -1065,7 +1183,7 @@ existing_lock:
                  if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) ||
                      !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK)) {
                          CERROR("Granting sync lock to libclient. "
-                               "req fl %d, rep fl %d, lock fl %d\n",
+                               "req fl %d, rep fl %d, lock fl "LPX64"\n",
                                 dlm_req->lock_flags, dlm_rep->lock_flags,
                                 lock->l_flags);
                          LDLM_ERROR(lock, "sync lock");
@@ -1138,7 +1256,7 @@ int ldlm_handle_convert(struct ptlrpc_request *req)
          struct ldlm_reply *dlm_rep;
          struct ldlm_lock *lock;
          int rc;
-        int size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREPLY_OFF]   = sizeof(*dlm_rep) };
          ENTRY;
  
@@ -1149,9 +1267,11 @@ int ldlm_handle_convert(struct ptlrpc_request *req)
                  RETURN (-EFAULT);
          }
  
-        if (req->rq_export && req->rq_export->exp_ldlm_stats)
-                lprocfs_counter_incr(req->rq_export->exp_ldlm_stats,
+        if (req->rq_export && req->rq_export->exp_nid_stats &&
+            req->rq_export->exp_nid_stats->nid_ldlm_stats) {
+                lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats,
                                       LDLM_CONVERT - LDLM_FIRST_OPC);
+        }
  
          rc = lustre_pack_reply(req, 2, size, NULL);
          if (rc)
@@ -1258,9 +1378,11 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
                  RETURN(-EFAULT);
          }
  
-        if (req->rq_export && req->rq_export->exp_ldlm_stats)
-                lprocfs_counter_incr(req->rq_export->exp_ldlm_stats,
+        if (req->rq_export && req->rq_export->exp_nid_stats &&
+            req->rq_export->exp_nid_stats->nid_ldlm_stats) {
+                lprocfs_counter_incr(req->rq_export->exp_nid_stats->nid_ldlm_stats,
                                       LDLM_CANCEL - LDLM_FIRST_OPC);
+        }
  
          rc = lustre_pack_reply(req, 1, NULL, NULL);
          if (rc)
@@ -1356,7 +1478,7 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
                     &lock->l_resource->lr_name,
                     sizeof(lock->l_resource->lr_name)) != 0) {
                  unlock_res_and_lock(lock);
-                if (ldlm_lock_change_resource(ns, lock, 
+                if (ldlm_lock_change_resource(ns, lock,
                                  dlm_req->lock_desc.l_resource.lr_name)) {
                          LDLM_ERROR(lock, "Failed to allocate resource");
                          LDLM_LOCK_PUT(lock);
@@ -1425,7 +1547,7 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
          if (lock->l_granted_mode == LCK_PW &&
              !lock->l_readers && !lock->l_writers &&
              cfs_time_after(cfs_time_current(),
-                           cfs_time_add(lock->l_last_used, 
+                           cfs_time_add(lock->l_last_used,
                                          cfs_time_seconds(10)))) {
                  unlock_res_and_lock(lock);
                  if (ldlm_bl_to_thread_lock(ns, NULL, lock))
@@ -1561,6 +1683,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
          case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */
                  OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
                  rc = llog_origin_handle_cancel(req);
+                OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_REP, 0);
                  ldlm_callback_reply(req, rc);
                  RETURN(0);
          case OBD_QC_CALLBACK:
@@ -1628,15 +1751,21 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                  RETURN(0);
          }
  
+        if ((lock->l_flags & LDLM_FL_FAIL_LOC) && 
+            lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK)
+                OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
          /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
          lock_res_and_lock(lock);
          lock->l_flags |= (dlm_req->lock_flags & LDLM_AST_FLAGS);
          if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
-                /* If somebody cancels locks and cache is already droped,
+                /* If somebody cancels lock and cache is already droped,
+                 * or lock is failed before cp_ast received on client,
                   * we can tell the server we have no lock. Otherwise, we
                   * should send cancel after dropping the cache. */
-                if ((lock->l_flags & LDLM_FL_CANCELING) &&
-                    (lock->l_flags & LDLM_FL_BL_DONE)) {
+                if (((lock->l_flags & LDLM_FL_CANCELING) &&
+                    (lock->l_flags & LDLM_FL_BL_DONE)) ||
+                    (lock->l_flags & LDLM_FL_FAILED)) {
                          LDLM_DEBUG(lock, "callback on lock "
                                     LPX64" - lock disappeared\n",
                                     dlm_req->lock_handle[0].cookie);
@@ -1727,6 +1856,7 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req)
          case OBD_LOG_CANCEL:
                  OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
                  rc = llog_origin_handle_cancel(req);
+                OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_REP, 0);
                  ldlm_callback_reply(req, rc);
                  RETURN(0);
          default:
@@ -1865,6 +1995,88 @@ static int ldlm_bl_thread_main(void *arg)
  
  #endif
  
+/* 
+ * Export handle<->lock hash operations. 
+ */
+static unsigned
+ldlm_export_lock_hash(lustre_hash_t *lh, void *key, unsigned mask)
+{
+        return lh_u64_hash(((struct lustre_handle *)key)->cookie, mask);
+}
+
+static void *
+ldlm_export_lock_key(struct hlist_node *hnode)
+{
+        struct ldlm_lock *lock;
+        ENTRY;
+
+        lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+        RETURN(&lock->l_remote_handle);
+}
+
+static int
+ldlm_export_lock_compare(void *key, struct hlist_node *hnode)
+{
+        ENTRY;
+        RETURN(lustre_handle_equal(ldlm_export_lock_key(hnode), key));
+}
+
+static void *
+ldlm_export_lock_get(struct hlist_node *hnode)
+{
+        struct ldlm_lock *lock;
+        ENTRY;
+
+        lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+        LDLM_LOCK_GET(lock);
+
+        RETURN(lock);
+}
+
+static void *
+ldlm_export_lock_put(struct hlist_node *hnode)
+{
+        struct ldlm_lock *lock;
+        ENTRY;
+
+        lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+        LDLM_LOCK_PUT(lock);
+
+        RETURN(lock);
+}
+
+static lustre_hash_ops_t ldlm_export_lock_ops = {
+        .lh_hash    = ldlm_export_lock_hash,
+        .lh_key     = ldlm_export_lock_key,
+        .lh_compare = ldlm_export_lock_compare,
+        .lh_get     = ldlm_export_lock_get,
+        .lh_put     = ldlm_export_lock_put
+};
+
+int ldlm_init_export(struct obd_export *exp)
+{
+        ENTRY;
+
+        exp->exp_lock_hash =
+                lustre_hash_init(obd_uuid2str(&exp->exp_client_uuid),
+                                 7, 16, &ldlm_export_lock_ops, LH_REHASH);
+
+        if (!exp->exp_lock_hash)
+                RETURN(-ENOMEM);
+
+        RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_export);
+
+void ldlm_destroy_export(struct obd_export *exp)
+{
+        ENTRY;
+        lustre_hash_exit(exp->exp_lock_hash);
+        exp->exp_lock_hash = NULL;
+        EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_export);
+
  static int ldlm_setup(void);
  static int ldlm_cleanup(void);
  
@@ -1943,7 +2155,7 @@ static int ldlm_setup(void)
                                  ldlm_callback_handler, "ldlm_cbd",
                                  ldlm_svc_proc_dir, NULL,
                                  ldlm_min_threads, ldlm_max_threads,
-                                "ldlm_cb");
+                                "ldlm_cb", NULL);
  
          if (!ldlm_state->ldlm_cb_service) {
                  CERROR("failed to start service\n");
@@ -1957,7 +2169,7 @@ static int ldlm_setup(void)
                                  ldlm_cancel_handler, "ldlm_canceld",
                                  ldlm_svc_proc_dir, NULL,
                                  ldlm_min_threads, ldlm_max_threads,
-                                "ldlm_cn");
+                                "ldlm_cn", NULL);
  
          if (!ldlm_state->ldlm_cancel_service) {
                  CERROR("failed to start service\n");
@@ -2042,7 +2254,7 @@ static int ldlm_cleanup(void)
  #endif
          ENTRY;
  
-        if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || 
+        if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) ||
              !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) {
                  CERROR("ldlm still has namespaces; clean these up first.\n");
                  ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
@@ -2224,9 +2436,8 @@ EXPORT_SYMBOL(target_queue_recovery_request);
  EXPORT_SYMBOL(target_handle_ping);
  EXPORT_SYMBOL(target_pack_pool_reply);
  EXPORT_SYMBOL(target_handle_disconnect);
-EXPORT_SYMBOL(target_queue_last_replay_reply);
+EXPORT_SYMBOL(target_handle_reply);
  
  /* l_lock.c */
  EXPORT_SYMBOL(lock_res_and_lock);
  EXPORT_SYMBOL(unlock_res_and_lock);
-
diff --git a/lustre/ldlm/ldlm_plain.c b/lustre/ldlm/ldlm_plain.c

index 68b5bf3..945ec7f 100644 (file)
--- a/lustre/ldlm/ldlm_plain.c
+++ b/lustre/ldlm/ldlm_plain.c
@@ -1,27 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_plain.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c

index bd89cfa..b7c0013 100644 (file)
--- a/lustre/ldlm/ldlm_pool.c
+++ b/lustre/ldlm/ldlm_pool.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2007 Cluster File Systems, Inc.
- *   Author: Yury Umanets <umka@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_pool.c
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
   */
  
  /* 
@@ -71,10 +86,6 @@
   * pl_cancel_rate - Number of canceled locks for last T (calculated);
   * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
   * pl_grant_plan - Planned number of granted locks for next T (calculated);
- *
- * pl_grant_step - Grant plan step, that is how ->pl_grant_plan
- * will change in next T (tunable);
- *
   * pl_server_lock_volume - Current server lock volume (calculated);
   *
   * As it may be seen from list above, we have few possible tunables which may
@@ -105,14 +116,27 @@
  #define LDLM_POOL_HOST_L ((num_physpages >> (20 - CFS_PAGE_SHIFT)) * 50)
  
  /*
- * Default step in % for grant plan. 
+ * Maximal possible grant step plan in %. 
   */
-#define LDLM_POOL_GSP (10)
+#define LDLM_POOL_MAX_GSP (30)
+
+/*
+ * Minimal possible grant step plan in %. 
+ */
+#define LDLM_POOL_MIN_GSP (1)
+
+/*
+ * This controls the speed of reaching LDLM_POOL_MAX_GSP
+ * with increasing thread period. This is 4s which means
+ * that for 10s thread period we will have 2 steps by 4s
+ * each.
+ */
+#define LDLM_POOL_GSP_STEP (4)
  
  /* 
   * LDLM_POOL_GSP% of all locks is default GP. 
   */
-#define LDLM_POOL_GP(L)   (((L) * LDLM_POOL_GSP) / 100)
+#define LDLM_POOL_GP(L)   (((L) * LDLM_POOL_MAX_GSP) / 100)
  
  /* 
   * Max age for locks on clients. 
@@ -170,6 +194,37 @@ static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
  }
  
  /**
+ * Calculates suggested grant_step in % of available locks for passed 
+ * \a period. This is later used in grant_plan calculations.
+ */
+static inline int ldlm_pool_t2gsp(int t)
+{
+        /*
+         * This yeilds 1% grant step for anything below LDLM_POOL_GSP_STEP
+         * and up to 30% for anything higher than LDLM_POOL_GSP_STEP.
+         * 
+         * How this will affect execution is the following:
+         *
+         * - for thread peroid 1s we will have grant_step 1% which good from
+         * pov of taking some load off from server and push it out to clients.
+         * This is like that because 1% for grant_step means that server will
+         * not allow clients to get lots of locks inshort period of time and
+         * keep all old locks in their caches. Clients will always have to
+         * get some locks back if they want to take some new;
+         *
+         * - for thread period 10s (which is default) we will have 23% which
+         * means that clients will have enough of room to take some new locks
+         * without getting some back. All locks from this 23% which were not 
+         * taken by clients in current period will contribute in SLV growing.
+         * SLV growing means more locks cached on clients until limit or grant
+         * plan is reached.
+         */
+        return LDLM_POOL_MAX_GSP - 
+                (LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) / 
+                (1 << (t / LDLM_POOL_GSP_STEP));
+}
+
+/**
   * Recalculates next grant limit on passed \a pl.
   *
   * \pre ->pl_lock is locked. 
@@ -177,11 +232,12 @@ static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
  static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
  {
          int granted, grant_step, limit;
-        
+
          limit = ldlm_pool_get_limit(pl);
          granted = atomic_read(&pl->pl_granted);
  
-        grant_step = ((limit - granted) * pl->pl_grant_step) / 100;
+        grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+        grant_step = ((limit - granted) * grant_step) / 100;
          pl->pl_grant_plan = granted + grant_step;
  }
  
@@ -288,12 +344,7 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
  
          spin_lock(&pl->pl_lock);
          recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
-        if (recalc_interval_sec > 0) {
-                /* 
-                 * Update statistics.
-                 */
-                ldlm_pool_recalc_stats(pl);
-
+        if (recalc_interval_sec >= pl->pl_recalc_period) {
                  /* 
                   * Recalc SLV after last period. This should be done
                   * _before_ recalculating new grant plan. 
@@ -310,14 +361,8 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
                   */
                  ldlm_pool_recalc_grant_plan(pl);
  
-                /* 
-                 * Zero out all rates and speed for the last period. 
-                 */
-                atomic_set(&pl->pl_grant_rate, 0);
-                atomic_set(&pl->pl_cancel_rate, 0);
-                atomic_set(&pl->pl_grant_speed, 0);
                  pl->pl_recalc_time = cfs_time_current_sec();
-                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, 
+                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
                                      recalc_interval_sec);
          }
          spin_unlock(&pl->pl_lock);
@@ -432,29 +477,23 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
          ENTRY;
  
          spin_lock(&pl->pl_lock);
+        /*
+         * Check if we need to recalc lists now.
+         */
+        recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+        if (recalc_interval_sec < pl->pl_recalc_period) {
+                spin_unlock(&pl->pl_lock);
+                RETURN(0);
+        }
  
          /* 
           * Make sure that pool knows last SLV and Limit from obd. 
           */
          ldlm_cli_pool_pop_slv(pl);
  
-        recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
-        if (recalc_interval_sec > 0) {
-                /* 
-                 * Update statistics only every T. 
-                 */
-                ldlm_pool_recalc_stats(pl);
-
-                /* 
-                 * Zero out grant/cancel rates and speed for last period. 
-                 */
-                atomic_set(&pl->pl_grant_rate, 0);
-                atomic_set(&pl->pl_cancel_rate, 0);
-                atomic_set(&pl->pl_grant_speed, 0);
-                pl->pl_recalc_time = cfs_time_current_sec();
-                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, 
-                                    recalc_interval_sec);
-        }
+        pl->pl_recalc_time = cfs_time_current_sec();
+        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, 
+                            recalc_interval_sec);
          spin_unlock(&pl->pl_lock);
  
          /* 
@@ -482,8 +521,8 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
                                  int nr, unsigned int gfp_mask)
  {
          ENTRY;
-        
-        /* 
+
+        /*
           * Do not cancel locks in case lru resize is disabled for this ns. 
           */
          if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
@@ -526,14 +565,33 @@ struct ldlm_pool_ops ldlm_cli_pool_ops = {
   */
  int ldlm_pool_recalc(struct ldlm_pool *pl)
  {
+        time_t recalc_interval_sec;
          int count;
  
+        spin_lock(&pl->pl_lock);
+        recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+        if (recalc_interval_sec > 0) {
+                /*
+                 * Update pool statistics every 1s.
+                 */
+                ldlm_pool_recalc_stats(pl);
+
+                /*
+                 * Zero out all rates and speed for the last period. 
+                 */
+                atomic_set(&pl->pl_grant_rate, 0);
+                atomic_set(&pl->pl_cancel_rate, 0);
+                atomic_set(&pl->pl_grant_speed, 0);
+        }
+        spin_unlock(&pl->pl_lock);
+
          if (pl->pl_ops->po_recalc != NULL) {
                  count = pl->pl_ops->po_recalc(pl);
                  lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, 
                                      count);
                  return count;
          }
+
          return 0;
  }
  EXPORT_SYMBOL(ldlm_pool_recalc);
@@ -546,7 +604,7 @@ int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
                       unsigned int gfp_mask)
  {
          int cancel = 0;
-        
+
          if (pl->pl_ops->po_shrink != NULL) {
                  cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
                  if (nr > 0) {
@@ -584,7 +642,7 @@ static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
                                   int count, int *eof, void *data)
  {
          int granted, grant_rate, cancel_rate, grant_step;
-        int nr = 0, grant_speed, grant_plan;
+        int nr = 0, grant_speed, grant_plan, lvf;
          struct ldlm_pool *pl = data;
          __u64 slv, clv;
          __u32 limit;
@@ -594,25 +652,26 @@ static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
          clv = pl->pl_client_lock_volume;
          limit = ldlm_pool_get_limit(pl);
          grant_plan = pl->pl_grant_plan;
-        grant_step = pl->pl_grant_step;
          granted = atomic_read(&pl->pl_granted);
          grant_rate = atomic_read(&pl->pl_grant_rate);
+        lvf = atomic_read(&pl->pl_lock_volume_factor);
          grant_speed = atomic_read(&pl->pl_grant_speed);
          cancel_rate = atomic_read(&pl->pl_cancel_rate);
+        grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
          spin_unlock(&pl->pl_lock);
  
          nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
                         pl->pl_name);
          nr += snprintf(page + nr, count - nr, "  SLV: "LPU64"\n", slv);
          nr += snprintf(page + nr, count - nr, "  CLV: "LPU64"\n", clv);
+        nr += snprintf(page + nr, count - nr, "  LVF: %d\n", lvf);
  
-        nr += snprintf(page + nr, count - nr, "  LVF: %d\n",
-                       atomic_read(&pl->pl_lock_volume_factor));
-
-        nr += snprintf(page + nr, count - nr, "  GSP: %d%%\n",
-                       grant_step);
-        nr += snprintf(page + nr, count - nr, "  GP:  %d\n",
-                       grant_plan);
+        if (ns_is_server(ldlm_pl2ns(pl))) {
+                nr += snprintf(page + nr, count - nr, "  GSP: %d%%\n",
+                               grant_step);
+                nr += snprintf(page + nr, count - nr, "  GP:  %d\n",
+                               grant_plan);
+        }
          nr += snprintf(page + nr, count - nr, "  GR:  %d\n",
                         grant_rate);
          nr += snprintf(page + nr, count - nr, "  CR:  %d\n",
@@ -627,8 +686,8 @@ static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
  }
  
  LDLM_POOL_PROC_READER(grant_plan, int);
-LDLM_POOL_PROC_READER(grant_step, int);
-LDLM_POOL_PROC_WRITER(grant_step, int);
+LDLM_POOL_PROC_READER(recalc_period, int);
+LDLM_POOL_PROC_WRITER(recalc_period, int);
  
  static int ldlm_pool_proc_init(struct ldlm_pool *pl)
  {
@@ -697,11 +756,10 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl)
          pool_vars[0].read_fptr = lprocfs_rd_grant_plan;
          lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
  
-        snprintf(var_name, MAX_STRING_SIZE, "grant_step");
+        snprintf(var_name, MAX_STRING_SIZE, "recalc_period");
          pool_vars[0].data = pl;
-        pool_vars[0].read_fptr = lprocfs_rd_grant_step;
-        if (ns_is_server(ns))
-                pool_vars[0].write_fptr = lprocfs_wr_grant_step;
+        pool_vars[0].read_fptr = lprocfs_rd_recalc_period;
+        pool_vars[0].write_fptr = lprocfs_wr_recalc_period;
          lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
  
          snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
@@ -791,7 +849,6 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
          atomic_set(&pl->pl_grant_rate, 0);
          atomic_set(&pl->pl_cancel_rate, 0);
          atomic_set(&pl->pl_grant_speed, 0);
-        pl->pl_grant_step = LDLM_POOL_GSP;
          pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L);
  
          snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
@@ -800,11 +857,13 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
          if (client == LDLM_NAMESPACE_SERVER) {
                  pl->pl_ops = &ldlm_srv_pool_ops;
                  ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
+                pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD;
                  pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L);
          } else {
-                pl->pl_server_lock_volume = 1;
                  ldlm_pool_set_limit(pl, 1);
+                pl->pl_server_lock_volume = 1;
                  pl->pl_ops = &ldlm_cli_pool_ops;
+                pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD;
          }
          pl->pl_client_lock_volume = 0;
          rc = ldlm_pool_proc_init(pl);
@@ -821,7 +880,7 @@ void ldlm_pool_fini(struct ldlm_pool *pl)
  {
          ENTRY;
          ldlm_pool_proc_fini(pl);
-        
+
          /* 
           * Pool should not be used after this point. We can't free it here as
           * it lives in struct ldlm_namespace, but still interested in catching
@@ -845,16 +904,15 @@ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
           */
          if (lock->l_resource->lr_type == LDLM_FLOCK)
                  return;
-
          ENTRY;
-                
+
          atomic_inc(&pl->pl_granted);
          atomic_inc(&pl->pl_grant_rate);
          atomic_inc(&pl->pl_grant_speed);
  
          lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);
- 
-        /* 
+
+        /*
           * Do not do pool recalc for client side as all locks which
           * potentially may be canceled has already been packed into 
           * enqueue/cancel rpc. Also we do not want to run out of stack
@@ -877,11 +935,12 @@ void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
          if (lock->l_resource->lr_type == LDLM_FLOCK)
                  return;
          ENTRY;
+
          LASSERT(atomic_read(&pl->pl_granted) > 0);
          atomic_dec(&pl->pl_granted);
          atomic_inc(&pl->pl_cancel_rate);
          atomic_dec(&pl->pl_grant_speed);
-        
+
          lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);
  
          if (ns_is_server(ldlm_pl2ns(pl)))
@@ -984,17 +1043,6 @@ static struct shrinker *ldlm_pools_srv_shrinker;
  static struct shrinker *ldlm_pools_cli_shrinker;
  static struct completion ldlm_pools_comp;
  
-void ldlm_pools_wakeup(void)
-{
-        ENTRY;
-        if (ldlm_pools_thread == NULL)
-                return;
-        ldlm_pools_thread->t_flags |= SVC_EVENT;
-        cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
-        EXIT;
-}
-EXPORT_SYMBOL(ldlm_pools_wakeup);
-
  /* 
   * Cancel \a nr locks from all namespaces (if possible). Returns number of
   * cached locks after shrink is finished. All namespaces are asked to
@@ -1060,7 +1108,7 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr,
                  ldlm_namespace_get(ns);
                  ldlm_namespace_move_locked(ns, client);
                  mutex_up(ldlm_namespace_lock(client));
-                
+
                  nr_locks = ldlm_pool_granted(&ns->ns_pool);
                  cancel = 1 + nr_locks * nr / total;
                  ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
@@ -1208,7 +1256,7 @@ static int ldlm_pools_thread_main(void *arg)
                   */
                  ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
                  ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
-                
+
                  /*
                   * Wait until the next check time, or until we're
                   * stopped. 
@@ -1423,12 +1471,6 @@ void ldlm_pools_fini(void)
  }
  EXPORT_SYMBOL(ldlm_pools_fini);
  
-void ldlm_pools_wakeup(void)
-{
-        return;
-}
-EXPORT_SYMBOL(ldlm_pools_wakeup);
-
  void ldlm_pools_recalc(ldlm_side_t client)
  {
          return;
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index 75963f9..e318e88 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
@@ -102,6 +114,7 @@ int ldlm_get_enq_timeout(struct ldlm_lock *lock)
          timeout = timeout + (timeout >> 1); /* 150% */
          return max(timeout, ldlm_enqueue_min);
  }
+EXPORT_SYMBOL(ldlm_get_enq_timeout);
  
  static int is_granted_or_cancelled(struct ldlm_lock *lock)
  {
@@ -175,8 +188,17 @@ noreproc:
                  spin_unlock(&imp->imp_lock);
          }
  
-        /* Go to sleep until the lock is granted or cancelled. */
-        rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi);
+        if (ns_is_client(lock->l_resource->lr_namespace) && 
+            lock->l_resource->lr_type == LDLM_EXTENT &&
+            OBD_FAIL_CHECK(OBD_FAIL_LDLM_INTR_CP_AST | OBD_FAIL_ONCE)) {
+                obd_fail_loc = OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE;
+                lock->l_flags |= LDLM_FL_FAIL_LOC;
+                rc = -EINTR;
+        } else {
+                /* Go to sleep until the lock is granted or cancelled. */
+                rc = l_wait_event(lock->l_waitq, 
+                                  is_granted_or_cancelled(lock), &lwi);
+        }
  
          if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED) {
                  LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed");
@@ -337,13 +359,31 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns,
                                  struct ldlm_lock *lock,
                                  struct lustre_handle *lockh, int mode)
  {
+        int need_cancel = 0;
+
          /* Set a flag to prevent us from sending a CANCEL (bug 407) */
          lock_res_and_lock(lock);
-        lock->l_flags |= LDLM_FL_LOCAL_ONLY;
+        /* Check that lock is not granted or failed, we might race. */
+        if ((lock->l_req_mode != lock->l_granted_mode) && 
+            !(lock->l_flags & LDLM_FL_FAILED)) {
+                /* Make sure that this lock will not be found by raced
+                 * bl_ast and -EINVAL reply is sent to server anyways. 
+                 * bug 17645 */
+                lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED | 
+                                 LDLM_FL_ATOMIC_CB;
+                need_cancel = 1;
+        }
          unlock_res_and_lock(lock);
-        LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY");
-
-        ldlm_lock_decref_and_cancel(lockh, mode);
+  
+        if (need_cancel) {
+                LDLM_DEBUG(lock, 
+                           "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | " 
+                           "LDLM_FL_ATOMIC_CB");
+                ldlm_lock_decref_and_cancel(lockh, mode);
+        } else {
+                LDLM_DEBUG(lock, "lock was granted or failed in race");
+                ldlm_lock_decref(lockh, mode);
+        }
  
          /* XXX - HACK because we shouldn't call ldlm_lock_destroy()
           *       from llite/file.c/ll_file_flock(). */
@@ -359,6 +399,7 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
  {
          struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
          int is_replay = *flags & LDLM_FL_REPLAY;
+        struct lustre_handle old_hash_key;
          struct ldlm_lock *lock;
          struct ldlm_reply *reply;
          int cleanup_phase = 1;
@@ -410,7 +451,15 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
          cleanup_phase = 0;
  
          lock_res_and_lock(lock);
+        old_hash_key = lock->l_remote_handle;
          lock->l_remote_handle = reply->lock_handle;
+
+        /* Key change rehash lock in per-export hash with new key */
+        if (exp->exp_lock_hash)
+                lustre_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
+                                       &lock->l_remote_handle,
+                                       &lock->l_exp_hash);
+
          *flags = reply->lock_flags;
          lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
          /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
@@ -433,12 +482,18 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                          lock->l_req_mode = newmode;
                  }
  
-                if (reply->lock_desc.l_resource.lr_name.name[0] !=
-                    lock->l_resource->lr_name.name[0]) {
-                        CDEBUG(D_INFO, "remote intent success, locking %ld "
-                               "instead of %ld\n",
-                              (long)reply->lock_desc.l_resource.lr_name.name[0],
-                               (long)lock->l_resource->lr_name.name[0]);
+                if (memcmp(reply->lock_desc.l_resource.lr_name.name,
+                          lock->l_resource->lr_name.name,
+                          sizeof(struct ldlm_res_id))) {
+                        CDEBUG(D_INFO, "remote intent success, locking "
+                                        "("LPU64"/"LPU64"/"LPU64") instead of "
+                                        "("LPU64"/"LPU64"/"LPU64")\n",
+                               reply->lock_desc.l_resource.lr_name.name[0],
+                               reply->lock_desc.l_resource.lr_name.name[1],
+                               reply->lock_desc.l_resource.lr_name.name[2],
+                               lock->l_resource->lr_name.name[0],
+                               lock->l_resource->lr_name.name[1],
+                               lock->l_resource->lr_name.name[2]);
  
                          rc = ldlm_lock_change_resource(ns, lock,
                                             reply->lock_desc.l_resource.lr_name);
@@ -509,13 +564,16 @@ cleanup:
   * a single page on the send/receive side. XXX: 512 should be changed
   * to more adequate value. */
  static inline int ldlm_req_handles_avail(struct obd_export *exp,
-                                         int *size, int bufcount, int off)
+                                         __u32 *size, int bufcount, int off)
  {
          int avail = min_t(int, LDLM_MAXREQSIZE, CFS_PAGE_SIZE - 512);
  
          avail -= lustre_msg_size(class_exp2cliimp(exp)->imp_msg_magic,
                                   bufcount, size);
-        avail /= sizeof(struct lustre_handle);
+        if (likely(avail >= 0))
+                avail /= (int)sizeof(struct lustre_handle);
+        else
+                avail = 0;
          avail += LDLM_LOCKREQ_HANDLES - off;
  
          return avail;
@@ -523,7 +581,7 @@ static inline int ldlm_req_handles_avail(struct obd_export *exp,
  
  static inline int ldlm_cancel_handles_avail(struct obd_export *exp)
  {
-        int size[2] = { sizeof(struct ptlrpc_body),
+        __u32 size[2] = { sizeof(struct ptlrpc_body),
                          sizeof(struct ldlm_request) };
          return ldlm_req_handles_avail(exp, size, 2, 0);
  }
@@ -531,7 +589,7 @@ static inline int ldlm_cancel_handles_avail(struct obd_export *exp)
  /* Cancel lru locks and pack them into the enqueue request. Pack there the given
   * @count locks in @cancels. */
  struct ptlrpc_request *ldlm_prep_elc_req(struct obd_export *exp, int version,
-                                         int opc, int bufcount, int *size,
+                                         int opc, int bufcount, __u32 *size,
                                           int bufoff, int canceloff,
                                           struct list_head *cancels, int count)
  {
@@ -549,12 +607,12 @@ struct ptlrpc_request *ldlm_prep_elc_req(struct obd_export *exp, int version,
                  LASSERT(bufoff < bufcount);
  
                  avail = ldlm_req_handles_avail(exp, size, bufcount, canceloff);
-                flags = ns_connect_lru_resize(ns) ? 
+                flags = ns_connect_lru_resize(ns) ?
                          LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
                  to_free = !ns_connect_lru_resize(ns) &&
                            opc == LDLM_ENQUEUE ? 1 : 0;
  
-                /* Cancel lru locks here _only_ if the server supports 
+                /* Cancel lru locks here _only_ if the server supports
                   * EARLY_CANCEL. Otherwise we have to send extra CANCEL
                   * rpc, what will make us slower. */
                  if (avail > count)
@@ -566,8 +624,10 @@ struct ptlrpc_request *ldlm_prep_elc_req(struct obd_export *exp, int version,
                          pack = avail;
                  size[bufoff] = ldlm_request_bufsize(pack, opc);
          }
+
          req = ptlrpc_prep_req(class_exp2cliimp(exp), version,
                                opc, bufcount, size, NULL);
+        req->rq_export = class_export_get(exp);
          if (exp_connect_cancelset(exp) && req) {
                  if (canceloff) {
                          dlm = lustre_msg_buf(req->rq_reqmsg, bufoff,
@@ -589,7 +649,7 @@ struct ptlrpc_request *ldlm_prep_elc_req(struct obd_export *exp, int version,
  }
  
  struct ptlrpc_request *ldlm_prep_enqueue_req(struct obd_export *exp,
-                                             int bufcount, int *size,
+                                             int bufcount, __u32 *size,
                                               struct list_head *cancels,
                                               int count)
  {
@@ -614,9 +674,10 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
          struct ldlm_lock *lock;
          struct ldlm_request *body;
          struct ldlm_reply *reply;
-        int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(*body),
-                        [DLM_REPLY_REC_OFF]   = lvb_len };
+                        [DLM_REPLY_REC_OFF]   = lvb_len ? lvb_len :
+                                                sizeof(struct ost_lvb) };
          int is_replay = *flags & LDLM_FL_REPLAY;
          int req_passed_in = 1, rc, err;
          struct ptlrpc_request *req;
@@ -696,7 +757,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
          /* Continue as normal. */
          if (!req_passed_in) {
                  size[DLM_LOCKREPLY_OFF] = sizeof(*reply);
-                ptlrpc_req_set_repsize(req, 2 + (lvb_len > 0), size);
+                ptlrpc_req_set_repsize(req, 3, size);
          }
  
          /*
@@ -769,7 +830,7 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags)
          struct ldlm_lock *lock;
          struct ldlm_resource *res;
          struct ptlrpc_request *req;
-        int size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(*body) };
          int rc;
          ENTRY;
@@ -922,7 +983,7 @@ int ldlm_cli_cancel_req(struct obd_export *exp,
  {
          struct ptlrpc_request *req = NULL;
          struct ldlm_request *body;
-        int size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(*body) };
          struct obd_import *imp;
          int free, sent = 0;
@@ -1006,28 +1067,28 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req)
          __u64 old_slv, new_slv;
          __u32 new_limit;
          ENTRY;
-    
-        if (unlikely(!req->rq_import || !req->rq_import->imp_obd || 
+
+        if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
                       !imp_connect_lru_resize(req->rq_import)))
          {
-                /* 
-                 * Do nothing for corner cases. 
+                /*
+                 * Do nothing for corner cases.
                   */
                  RETURN(0);
          }
  
-        /* 
-         * In some cases RPC may contain slv and limit zeroed out. This is 
+        /*
+         * In some cases RPC may contain slv and limit zeroed out. This is
           * the case when server does not support lru resize feature. This is
           * also possible in some recovery cases when server side reqs have no
-         * ref to obd export and thus access to server side namespace is no 
-         * possible. 
+         * ref to obd export and thus access to server side namespace is no
+         * possible.
           */
-        if (lustre_msg_get_slv(req->rq_repmsg) == 0 || 
+        if (lustre_msg_get_slv(req->rq_repmsg) == 0 ||
              lustre_msg_get_limit(req->rq_repmsg) == 0) {
                  DEBUG_REQ(D_HA, req, "Zero SLV or Limit found "
-                          "(SLV: "LPU64", Limit: %u)", 
-                          lustre_msg_get_slv(req->rq_repmsg), 
+                          "(SLV: "LPU64", Limit: %u)",
+                          lustre_msg_get_slv(req->rq_repmsg),
                            lustre_msg_get_limit(req->rq_repmsg));
                  RETURN(0);
          }
@@ -1036,12 +1097,12 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req)
          new_slv = lustre_msg_get_slv(req->rq_repmsg);
          obd = req->rq_import->imp_obd;
  
-        /* 
-         * Set new SLV and Limit to obd fields to make accessible for pool 
+        /*
+         * Set new SLV and Limit to obd fields to make accessible for pool
           * thread. We do not access obd_namespace and pool directly here
           * as there is no reliable way to make sure that they are still
           * alive in cleanup time. Evil races are possible which may cause
-         * oops in that time. 
+         * oops in that time.
           */
          write_lock(&obd->obd_pool_lock);
          old_slv = obd->obd_pool_slv;
@@ -1049,26 +1110,6 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req)
          obd->obd_pool_limit = new_limit;
          write_unlock(&obd->obd_pool_lock);
  
-        /* 
-         * Check if we need to wakeup pools thread for fast SLV change. 
-         * This is only done when threads period is noticably long like 
-         * 10s or more. 
-         */
-#if defined(__KERNEL__) && (LDLM_POOLS_THREAD_PERIOD >= 10)
-        if (old_slv > 0) {
-                __u64 fast_change = old_slv * LDLM_POOLS_FAST_SLV_CHANGE;
-                do_div(fast_change, 100);
-
-                /* 
-                 * Wake up pools thread only if SLV has changed more than 
-                 * 50% since last update. In this case we want to react asap. 
-                 * Otherwise it is no sense to wake up pools as they are 
-                 * re-calculated every LDLM_POOLS_THREAD_PERIOD anyways. 
-                 */
-                if (old_slv > new_slv && old_slv - new_slv > fast_change)
-                        ldlm_pools_wakeup();
-        }
-#endif
          RETURN(0);
  }
  EXPORT_SYMBOL(ldlm_cli_update_pool);
@@ -1087,7 +1128,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                  LDLM_DEBUG_NOLOCK("lock is already being destroyed\n");
                  RETURN(0);
          }
-        
+
          rc = ldlm_cli_cancel_local(lock);
          if (rc < 0 || rc == LDLM_FL_LOCAL_ONLY) {
                  LDLM_LOCK_PUT(lock);
@@ -1155,17 +1196,17 @@ static int ldlm_cancel_list(struct list_head *cancels, int count, int flags)
          RETURN(count);
  }
  
-/* Return 1 to stop lru processing and keep current lock cached. Return zero 
+/* Return 1 to stop lru processing and keep current lock cached. Return zero
   * otherwise. */
  static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns,
                                                     struct ldlm_lock *lock,
-                                                   int unused, int added, 
+                                                   int unused, int added,
                                                     int count)
  {
          int lock_cost;
          __u64 page_nr;
  
-        /* Stop lru processing when we reached passed @count or checked all 
+        /* Stop lru processing when we reached passed @count or checked all
           * locks in lru. */
          if (count && added >= count)
                  return LDLM_POLICY_KEEP_LOCK;
@@ -1182,7 +1223,7 @@ static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns,
  #ifdef __KERNEL__
                  /* XXX: In fact this is evil hack, we can't access inode
                   * here. For doing it right we need somehow to have number
-                 * of covered by lock. This should be fixed later when 10718 
+                 * of covered by lock. This should be fixed later when 10718
                   * is landed. */
                  if (lock->l_ast_data != NULL) {
                          struct inode *inode = lock->l_ast_data;
@@ -1199,15 +1240,15 @@ static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns,
          /* Keep all expensive locks in lru for the memory pressure time
           * cancel policy. They anyways may be canceled by lru resize
           * pplicy if they have not small enough CLV. */
-        return lock_cost > ns->ns_shrink_thumb ? 
+        return lock_cost > ns->ns_shrink_thumb ?
                  LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
  }
  
-/* Return 1 to stop lru processing and keep current lock cached. Return zero 
+/* Return 1 to stop lru processing and keep current lock cached. Return zero
   * otherwise. */
  static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
-                                                 struct ldlm_lock *lock, 
-                                                 int unused, int added, 
+                                                 struct ldlm_lock *lock,
+                                                 int unused, int added,
                                                   int count)
  {
          cfs_time_t cur = cfs_time_current();
@@ -1215,7 +1256,7 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
          __u64 slv, lvf, lv;
          cfs_time_t la;
  
-        /* Stop lru processing when we reached passed @count or checked all 
+        /* Stop lru processing when we reached passed @count or checked all
           * locks in lru. */
          if (count && added >= count)
                  return LDLM_POLICY_KEEP_LOCK;
@@ -1223,63 +1264,63 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
          slv = ldlm_pool_get_slv(pl);
          lvf = ldlm_pool_get_lvf(pl);
  
-        la = cfs_duration_sec(cfs_time_sub(cur, 
+        la = cfs_duration_sec(cfs_time_sub(cur,
                                lock->l_last_used));
  
-        /* Stop when slv is not yet come from server or 
+        /* Stop when slv is not yet come from server or
           * lv is smaller than it is. */
          lv = lvf * la * unused;
  
          /* Inform pool about current CLV to see it via proc. */
          ldlm_pool_set_clv(pl, lv);
-        return (slv == 1 || lv < slv) ? 
+        return (slv == 1 || lv < slv) ?
                  LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
  }
  
-/* Return 1 to stop lru processing and keep current lock cached. Return zero 
+/* Return 1 to stop lru processing and keep current lock cached. Return zero
   * otherwise. */
  static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns,
-                                                   struct ldlm_lock *lock, 
+                                                   struct ldlm_lock *lock,
                                                     int unused, int added,
                                                     int count)
  {
-        /* Stop lru processing when we reached passed @count or checked all 
+        /* Stop lru processing when we reached passed @count or checked all
           * locks in lru. */
-        return (added >= count) ? 
+        return (added >= count) ?
                  LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
  }
  
-/* Return 1 to stop lru processing and keep current lock cached. Return zero 
+/* Return 1 to stop lru processing and keep current lock cached. Return zero
   * otherwise. */
  static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
-                                                 struct ldlm_lock *lock, 
+                                                 struct ldlm_lock *lock,
                                                   int unused, int added,
                                                   int count)
  {
-        /* Stop lru processing if young lock is found and we reached passed 
+        /* Stop lru processing if young lock is found and we reached passed
           * @count. */
-        return ((added >= count) && 
+        return ((added >= count) &&
                  cfs_time_before(cfs_time_current(),
                                  cfs_time_add(lock->l_last_used,
-                                             ns->ns_max_age))) ? 
+                                             ns->ns_max_age))) ?
                  LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
  }
  
-/* Return 1 to stop lru processing and keep current lock cached. Return zero 
+/* Return 1 to stop lru processing and keep current lock cached. Return zero
   * otherwise. */
  static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns,
-                                                    struct ldlm_lock *lock, 
+                                                    struct ldlm_lock *lock,
                                                      int unused, int added,
                                                      int count)
  {
-        /* Stop lru processing when we reached passed @count or checked all 
+        /* Stop lru processing when we reached passed @count or checked all
           * locks in lru. */
-        return (added >= count) ? 
+        return (added >= count) ?
                  LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
  }
  
-typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, 
-                                                      struct ldlm_lock *, int, 
+typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
+                                                      struct ldlm_lock *, int,
                                                        int, int);
  
  static ldlm_cancel_lru_policy_t
@@ -1296,10 +1337,10 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
                  if (flags & LDLM_CANCEL_AGED)
                          return ldlm_cancel_aged_policy;
          }
-        
+
          return ldlm_cancel_default_policy;
  }
- 
+
  /* - Free space in lru for @count new locks,
   *   redundant unused locks are canceled locally;
   * - also cancel locally unused aged locks;
@@ -1342,7 +1383,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
  
          pf = ldlm_cancel_lru_policy(ns, flags);
          LASSERT(pf != NULL);
-        
+
          while (!list_empty(&ns->ns_unused_list)) {
                  /* For any flags, stop scanning if @max is reached. */
                  if (max && added >= max)
@@ -1369,11 +1410,11 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                   * we find a lock that should stay in the cache.
                   * We should take into account lock age anyway
                   * as new lock even if it is small of weight is
-                 * valuable resource. 
+                 * valuable resource.
                   *
                   * That is, for shrinker policy we drop only
                   * old locks, but additionally chose them by
-                 * their weight. Big extent locks will stay in 
+                 * their weight. Big extent locks will stay in
                   * the cache. */
                  if (pf(ns, lock, unused, added, count) == LDLM_POLICY_KEEP_LOCK)
                          break;
@@ -1399,8 +1440,8 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
  
                  /* If we have chosen to cancel this lock voluntarily, we
                   * better send cancel notification to server, so that it
-                 * frees appropriate state. This might lead to a race 
-                 * where while we are doing cancel here, server is also 
+                 * frees appropriate state. This might lead to a race
+                 * where while we are doing cancel here, server is also
                   * silently cancelling this lock. */
                  lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
  
@@ -1429,7 +1470,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
          RETURN(ldlm_cancel_list(cancels, added, cancel_flags));
  }
  
-/* Returns number of locks which could be canceled next time when 
+/* Returns number of locks which could be canceled next time when
   * ldlm_cancel_lru() is called. Used from locks pool shrinker. */
  int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns,
                               int count, int max, int flags)
@@ -1450,10 +1491,10 @@ int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns,
                          break;
  
                  /* Somebody is already doing CANCEL or there is a
-                 * blocking request will send cancel. Let's not count 
+                 * blocking request will send cancel. Let's not count
                   * this lock. */
                  if ((lock->l_flags & LDLM_FL_CANCELING) ||
-                    (lock->l_flags & LDLM_FL_BL_AST)) 
+                    (lock->l_flags & LDLM_FL_BL_AST))
                          continue;
  
                  /* Pass the lock through the policy filter and see if it
@@ -1472,7 +1513,7 @@ int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns,
   * in a thread and this function will return after the thread has been
   * asked to call the callback.  when called with LDLM_SYNC the blocking
   * callback will be performed in this function. */
-int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, 
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
                      int flags)
  {
          CFS_LIST_HEAD(cancels);
@@ -1527,7 +1568,7 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res,
  
                  /* If somebody is already doing CANCEL, or blocking ast came,
                   * skip this lock. */
-                if (lock->l_flags & LDLM_FL_BL_AST || 
+                if (lock->l_flags & LDLM_FL_BL_AST ||
                      lock->l_flags & LDLM_FL_CANCELING)
                          continue;
  
@@ -1870,15 +1911,16 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
  static int replay_lock_interpret(struct ptlrpc_request *req,
                                   struct ldlm_async_args *aa, int rc)
  {
-        struct ldlm_lock *lock;
-        struct ldlm_reply *reply;
+        struct lustre_handle  old_hash_key;
+        struct ldlm_lock     *lock;
+        struct ldlm_reply    *reply;
+        struct obd_export    *exp;
  
          ENTRY;
          atomic_dec(&req->rq_import->imp_replay_inflight);
          if (rc != ELDLM_OK)
                  GOTO(out, rc);
  
-
          reply = lustre_swab_repbuf(req, DLM_LOCKREPLY_OFF, sizeof(*reply),
                                     lustre_swab_ldlm_reply);
          if (reply == NULL) {
@@ -1896,7 +1938,16 @@ static int replay_lock_interpret(struct ptlrpc_request *req,
                  GOTO(out, rc = -ESTALE);
          }
  
+        old_hash_key = lock->l_remote_handle;
          lock->l_remote_handle = reply->lock_handle;
+
+        /* Key change rehash lock in per-export hash with new key */
+       exp = req->rq_export;
+        if (exp && exp->exp_lock_hash)
+                lustre_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
+                                      &lock->l_remote_handle,
+                                       &lock->l_exp_hash);
+
          LDLM_DEBUG(lock, "replayed lock:");
          ptlrpc_import_recovery_state_machine(req->rq_import);
          LDLM_LOCK_PUT(lock);
@@ -1915,7 +1966,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
          struct ldlm_reply *reply;
          struct ldlm_async_args *aa;
          int buffers = 2;
-        int size[3] = { sizeof(struct ptlrpc_body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body) };
          int flags;
          ENTRY;
  
@@ -1934,6 +1985,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
                  ldlm_lock_cancel(lock);
                  RETURN(0);
          }
+
          /*
           * If granted mode matches the requested mode, this lock is granted.
           *
@@ -1982,7 +2034,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
  
          atomic_inc(&req->rq_import->imp_replay_inflight);
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-        aa = (struct ldlm_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          aa->lock_handle = body->lock_handle[0];
          req->rq_interpret_reply = replay_lock_interpret;
          ptlrpcd_add_req(req);
@@ -2005,15 +2057,22 @@ int ldlm_replay_locks(struct obd_import *imp)
          /* ensure this doesn't fall to 0 before all have been queued */
          atomic_inc(&imp->imp_replay_inflight);
  
-        (void)ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
-
-        list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
-                list_del_init(&lock->l_pending_chain);
-                if (rc)
-                        continue; /* or try to do the rest? */
-                rc = replay_one_lock(imp, lock);
+        if (imp->imp_no_lock_replay) {
+                /* VBR: locks should be cancelled here */
+                ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+                spin_lock(&imp->imp_lock);
+                imp->imp_no_lock_replay = 0;
+                spin_unlock(&imp->imp_lock);
+        } else {
+                (void)ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay,
+                                             &list);
+                list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
+                        list_del_init(&lock->l_pending_chain);
+                        if (rc)
+                                continue; /* or try to do the rest? */
+                        rc = replay_one_lock(imp, lock);
+                }
          }
-
          atomic_dec(&imp->imp_replay_inflight);
  
          RETURN(rc);
diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c

index cf8405e..cf3207b 100644 (file)
--- a/lustre/ldlm/ldlm_resource.c
+++ b/lustre/ldlm/ldlm_resource.c
@@ -1,27 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Peter Braam <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_resource.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
@@ -268,6 +283,12 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns)
                  lock_vars[0].write_fptr = lprocfs_wr_uint;
                  lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
          } else {
+                snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_timeouts",
+                         ns->ns_name);
+                lock_vars[0].data = &ns->ns_timeouts;
+                lock_vars[0].read_fptr = lprocfs_rd_uint;
+                lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
                  snprintf(lock_name, MAX_STRING_SIZE, "%s/max_nolock_bytes",
                           ns->ns_name);
                  lock_vars[0].data = &ns->ns_max_nolock_size;
@@ -348,9 +369,11 @@ ldlm_namespace_new(struct obd_device *obd, char *name,
                  CFS_INIT_LIST_HEAD(bucket);
  
          CFS_INIT_LIST_HEAD(&ns->ns_unused_list);
+        CFS_INIT_LIST_HEAD(&ns->ns_list_chain);
          ns->ns_nr_unused = 0;
          ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE;
          ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE;
+        ns->ns_timeouts = 0;
          spin_lock_init(&ns->ns_unused_lock);
          ns->ns_orig_connect_flags = 0;
          ns->ns_connect_flags = 0;
@@ -395,7 +418,6 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
          int local_only = (flags & LDLM_FL_LOCAL_ONLY);
          ENTRY;
  
-
          do {
                  struct ldlm_lock *lock = NULL;
  
@@ -979,8 +1001,8 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head,
          check_res_locked(res);
  
          ldlm_resource_dump(D_INFO, res);
-        CDEBUG(D_INFO, "About to add this lock:\n");
-        ldlm_lock_dump(D_INFO, lock, 0);
+        CDEBUG(D_OTHER, "About to add this lock:\n");
+        ldlm_lock_dump(D_OTHER, lock, 0);
  
          if (lock->l_destroyed) {
                  CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
@@ -999,7 +1021,7 @@ void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
  
          check_res_locked(res);
  
-        ldlm_resource_dump(D_OTHER, res);
+        ldlm_resource_dump(D_INFO, res);
          CDEBUG(D_OTHER, "About to insert this lock after %p:\n", original);
          ldlm_lock_dump(D_OTHER, new, 0);
  
diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am

index cf60902..bfe9644 100644 (file)
--- a/lustre/liblustre/Makefile.am
+++ b/lustre/liblustre/Makefile.am
@@ -19,7 +19,7 @@ LUSTRE_LIBS = libllite.a \
                $(top_builddir)/lustre/obdclass/liblustreclass.a \
                $(top_builddir)/lustre/lvfs/liblvfs.a
  
-if QUOTA
+if LIBLUSTRE
  QUOTA_LIBS = $(top_builddir)/lustre/quota/libquota.a
  endif
  
diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c

index e6788bf..62538d4 100644 (file)
--- a/lustre/liblustre/dir.c
+++ b/lustre/liblustre/dir.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light directory handling
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002-2004 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/dir.c
+ *
+ * Lustre Light directory handling
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -102,7 +119,7 @@ static int llu_dir_do_readpage(struct inode *inode, struct page *page)
          }
          ldlm_lock_dump_handle(D_OTHER, &lockh);
  
-        mdc_pack_fid(&mdc_fid, st->st_ino, lli->lli_st_generation, S_IFDIR);
+        ll_pack_fid(&mdc_fid, st->st_ino, lli->lli_st_generation, S_IFDIR);
  
          offset = (__u64)page->index << CFS_PAGE_SHIFT;
          rc = mdc_readpage(sbi->ll_mdc_exp, &mdc_fid,
diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c

index bd67c0f..accc378 100644 (file)
--- a/lustre/liblustre/file.c
+++ b/lustre/liblustre/file.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light file operations
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *  Copyright (c) 2002-2004 Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * lustre/liblustre/file.c
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Lustre Light file operations
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -82,14 +99,14 @@ void llu_prepare_mdc_op_data(struct mdc_op_data *data,
  
          if (i1) {
                  ll_i2gids(data->suppgids, i1, i2);
-                ll_inode2fid(&data->fid1, i1);
+                llu_inode2fid(&data->fid1, i1);
          }else {
                  ll_i2gids(data->suppgids, i2, i1);
-                ll_inode2fid(&data->fid1, i2);
+                llu_inode2fid(&data->fid1, i2);
          }
  
          if (i2)
-                ll_inode2fid(&data->fid2, i2);
+                llu_inode2fid(&data->fid2, i2);
          else
                  memset(&data->fid2, 0, sizeof(data->fid2));
  
@@ -115,16 +132,10 @@ void obdo_refresh_inode(struct inode *dst,
  
          if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(st->st_atime))
                  LTIME_S(st->st_atime) = src->o_atime;
-        
-        /* mtime is always updated with ctime, but can be set in past.
-           As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, leave mtime from mds 
-           for the same ctimes. */
-        if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(st->st_ctime)) {
+        if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(st->st_mtime))
+                LTIME_S(st->st_mtime) = src->o_mtime;
+        if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(st->st_ctime))
                  LTIME_S(st->st_ctime) = src->o_ctime;
-                if (valid & OBD_MD_FLMTIME)
-                        LTIME_S(st->st_mtime) = src->o_mtime;
-        }
          if (valid & OBD_MD_FLSIZE && src->o_size > st->st_size)
                  st->st_size = src->o_size;
          /* optimum IO size */
@@ -319,6 +330,7 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode)
          struct ptlrpc_request *req = NULL;
          struct obd_client_handle *och = &fd->fd_mds_och;
          struct obdo obdo;
+        struct mdc_op_data data = { { 0 } };
          int rc, valid;
          ENTRY;
  
@@ -343,7 +355,8 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode)
                  obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
                  obdo.o_valid |= OBD_MD_FLFLAGS;
          }
-        rc = mdc_close(mdc_exp, &obdo, och, &req);
+        data.fid1 = lli->lli_fid;
+        rc = mdc_close(mdc_exp, &data, &obdo, och, &req);
          if (rc == EAGAIN) {
                  /* We are the last writer, so the MDS has instructed us to get
                   * the file size and any write cookies, then close again. */
@@ -461,6 +474,7 @@ static void llu_truncate(struct inode *inode, obd_flag flags)
          struct intnl_stat *st = llu_i2stat(inode);
          struct obd_info oinfo = { { { 0 } } };
          struct obdo oa = { 0 };
+        obd_valid valid;
          int rc;
          ENTRY;
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%llu/%lu(%p) to %llu\n",
@@ -482,9 +496,41 @@ static void llu_truncate(struct inode *inode, obd_flag flags)
          oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
          oa.o_flags = flags; /* We don't actually want to copy inode flags */
   
-        obdo_from_inode(&oa, inode,
-                        OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME |
-                        OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+        valid = OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME;
+        if (flags & OBD_FL_TRUNCLOCK) {
+                /* lockless truncate
+                 *
+                 * 1. do not use inode's timestamps because concurrent
+                 * stat might fill the inode with out-of-date times,
+                 * send current instead
+                 *
+                 * 2.do no update lsm, as long as stat (via
+                 * llu_glimpse_size) will bring attributes from osts
+                 * anyway */
+                oa.o_mtime = oa.o_ctime = CURRENT_TIME;
+                oa.o_valid |= OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+        } else {
+                /* truncate under locks
+                 *
+                 * 1. update inode's mtime and ctime as long as
+                 * concurrent stat (via llu_glimpse_size) might bring
+                 * out-of-date ones
+                 *
+                 * 2. update lsm so that next stat (via
+                 * llu_glimpse_size) could get correct values in lsm */
+                struct ost_lvb xtimes;
+
+                lov_stripe_lock(lli->lli_smd);
+                st->st_mtime = st->st_ctime = CURRENT_TIME;
+                xtimes.lvb_mtime = st->st_mtime;
+                xtimes.lvb_ctime = st->st_ctime;
+                obd_update_lvb(llu_i2obdexp(inode), lli->lli_smd, &xtimes,
+                               OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+                lov_stripe_unlock(lli->lli_smd);
+
+                valid |= OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+        }
+        obdo_from_inode(&oa, inode, valid);
  
          obd_adjust_kms(llu_i2obdexp(inode), lli->lli_smd, st->st_size, 1);
  
diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh

index 8bbecd8..ef6f81f 100755 (executable)
--- a/lustre/liblustre/genlib.sh
+++ b/lustre/liblustre/genlib.sh
@@ -13,7 +13,12 @@ set -e
  
  AR=/usr/bin/ar
  # see http://osdir.com/ml/gmane.comp.gnu.binutils.bugs/2006-01/msg00016.php
-LD=gcc
+ppc64_CPU=`uname -p`
+if [ ${ppc64_CPU} == "ppc64" ]; then
+  LD="gcc -m64"
+else
+  LD="gcc"
+fi
  RANLIB=/usr/bin/ranlib
  
  CWD=`pwd`
diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c

index 09d17c3..96c27ef 100644 (file)
--- a/lustre/liblustre/llite_lib.c
+++ b/lustre/liblustre/llite_lib.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light common routines
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *  Copyright (c) 2002-2004 Cluster File Systems, Inc.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * lustre/liblustre/llite_lib.c
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Lustre Light common routines
   */
  
  #include <stdlib.h>
@@ -156,7 +173,8 @@ int liblustre_process_log(struct config_llog_instance *cfg,
          if (ocd == NULL)
                  GOTO(out_cleanup, rc = -ENOMEM);
  
-        ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT;
+        ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+                                 OBD_CONNECT_VBR;
  #ifdef LIBLUSTRE_POSIX_ACL
          ocd->ocd_connect_flags |= OBD_CONNECT_ACL;
  #endif
@@ -307,6 +325,8 @@ int _sysio_lustre_init(void)
  
  extern int _sysio_native_init();
  
+static int mnt_retry = 0;
+
  char *lustre_path = NULL;
  
  void __liblustre_setup_(void)
@@ -314,13 +334,20 @@ void __liblustre_setup_(void)
          char *target = NULL;
          char *lustre_driver = "lustre";
          unsigned mntflgs = 0;
-        int err;
+        int err, count;
  
          lustre_path = getenv("LIBLUSTRE_MOUNT_POINT");
          if (!lustre_path) {
                  lustre_path = "/mnt/lustre";
          }
  
+        target = getenv("LIBLUSTRE_MOUNT_RETRY");
+        if (target) {
+                mnt_retry = atoi(target);
+                if (mnt_retry < 0)
+                        mnt_retry = 0;
+        }
+
          /* mount target */
          target = getenv("LIBLUSTRE_MOUNT_TARGET");
          if (!target) {
@@ -349,7 +376,16 @@ void __liblustre_setup_(void)
                  exit(1);
  #endif /* INIT_SYSIO */
  
-        err = mount(target, lustre_path, lustre_driver, mntflgs, NULL);
+        count = mnt_retry;
+        do {
+                err = mount(target, lustre_path, lustre_driver, mntflgs, NULL);
+                if (err && mnt_retry && (-- count)) {
+                        fprintf(stderr, "Lustre mount failed: %s. "
+                                 "Will retry %d more times\n",
+                                strerror(errno), mnt_retry - count );
+                        sleep(2);
+                }
+        } while (err && count > 0);
          if (err) {
                  fprintf(stderr, "Lustre mount failed: %s\n", strerror(errno));
                  exit(1);
diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h

index e943aa4..a40098a 100644 (file)
--- a/lustre/liblustre/llite_lib.h
+++ b/lustre/liblustre/llite_lib.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __LLU_H_
@@ -75,6 +107,9 @@ struct llu_inode_info {
          /* not for stat, change it later */
          int                     lli_st_flags;
          unsigned long           lli_st_generation;
+        /* the most recent attributes from mds, it is used for timestampts
+         * only so far */
+        struct ost_lvb         lli_lvb;
  };
  
  static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs)
@@ -139,7 +174,7 @@ do {                                                                           \
  #define LL_LOOKUP_POSITIVE 1
  #define LL_LOOKUP_NEGATIVE 2
  
-static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode)
+static inline void llu_inode2fid(struct ll_fid *fid, struct inode *inode)
  {
          *fid = llu_i2info(inode)->lli_fid;
  }
diff --git a/lustre/liblustre/lutil.c b/lustre/liblustre/lutil.c

index 99f1cfe..f907890 100644 (file)
--- a/lustre/liblustre/lutil.c
+++ b/lustre/liblustre/lutil.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2004 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <stdlib.h>
@@ -62,14 +77,12 @@ void *inter_module_get(char *arg)
                  return ldlm_namespace_cleanup;
          else if (!strcmp(arg, "ldlm_replay_locks"))
                  return ldlm_replay_locks;
-#ifdef HAVE_QUOTA_SUPPORT
          else if (!strcmp(arg, "osc_quota_interface"))
                  return &osc_quota_interface;
          else if (!strcmp(arg, "mdc_quota_interface"))
                  return &mdc_quota_interface;
          else if (!strcmp(arg, "lov_quota_interface"))
                  return &lov_quota_interface;
-#endif
          else
                  return NULL;
  }
@@ -212,6 +225,46 @@ int liblustre_init_current(char *comm)
          return 0;
  }
  
+void cfs_cap_raise(cfs_cap_t cap)
+{
+        current->cap_effective |= (1 << cap);
+}
+
+void cfs_cap_lower(cfs_cap_t cap)
+{
+        current->cap_effective &= ~(1 << cap);
+}
+
+int cfs_cap_raised(cfs_cap_t cap)
+{
+        return current->cap_effective & (1 << cap);
+}
+
+void cfs_kernel_cap_pack(cfs_kernel_cap_t kcap, cfs_cap_t *cap)
+{
+        *cap = kcap;
+}
+
+void cfs_kernel_cap_unpack(cfs_kernel_cap_t *kcap, cfs_cap_t cap)
+{
+        *kcap = cap;
+}
+
+cfs_cap_t cfs_curproc_cap_pack(void) {
+        cfs_cap_t cap;
+        cfs_kernel_cap_pack(cfs_current()->cap_effective, &cap);
+        return cap;
+}
+
+void cfs_curproc_cap_unpack(cfs_cap_t cap) {
+        cfs_kernel_cap_unpack(&cfs_current()->cap_effective, cap);
+}
+
+int cfs_capable(cfs_cap_t cap)
+{
+        return cfs_cap_raised(cap);
+}
+
  int init_lib_portals()
  {
          int rc;
diff --git a/lustre/liblustre/lutil.h b/lustre/liblustre/lutil.h

index d235eb8..6ad418f 100644 (file)
--- a/lustre/liblustre/lutil.h
+++ b/lustre/liblustre/lutil.h
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2004 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __LUTIL_H_
diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c

index 2404ac5..ef60178 100644 (file)
--- a/lustre/liblustre/namei.c
+++ b/lustre/liblustre/namei.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light name resolution
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002-2004 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/namei.c
+ *
+ * Lustre Light name resolution
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -584,6 +601,8 @@ int llu_iop_lookup(struct pnode *pnode,
          }
  
  out:
+        if (it)
+                OBD_FREE(it, sizeof(*it));
          liblustre_wait_event(0);
          RETURN(rc);
  }
diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c

index e49d518..96a4b95 100644 (file)
--- a/lustre/liblustre/rw.c
+++ b/lustre/liblustre/rw.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light block IO
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *  Copyright (c) 2002-2004 Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * lustre/liblustre/rw.c
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Lustre Light block IO
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -93,8 +110,7 @@ static int llu_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock
          struct {
                  char name[16];
                  struct ldlm_lock *lock;
-                struct lov_stripe_md *lsm;
-        } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock, .lsm = lsm };
+        } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
          __u32 stripe, vallen = sizeof(stripe);
          int rc;
          ENTRY;
@@ -103,7 +119,7 @@ static int llu_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock
                  RETURN(0);
  
          /* get our offset in the lov */
-        rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
+        rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
          if (rc != 0) {
                  CERROR("obd_get_info: rc = %d\n", rc);
                  LBUG();
@@ -179,7 +195,7 @@ static int llu_glimpse_callback(struct ldlm_lock *lock, void *reqp)
          struct inode *inode = llu_inode_from_lock(lock);
          struct llu_inode_info *lli;
          struct ost_lvb *lvb;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
          int rc, stripe = 0;
          ENTRY;
  
@@ -256,7 +272,11 @@ int llu_glimpse_size(struct inode *inode)
                  RETURN(rc > 0 ? -EIO : rc);
          }
  
+        lov_stripe_lock(lli->lli_smd);
          inode_init_lvb(inode, &lvb);
+        /* merge timestamps the most resently obtained from mds with
+           timestamps obtained from osts */
+        lvb = lli->lli_lvb;
          rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
          st->st_size = lvb.lvb_size;
          st->st_blocks = lvb.lvb_blocks;
@@ -266,6 +286,7 @@ int llu_glimpse_size(struct inode *inode)
          st->st_mtime = lvb.lvb_mtime;
          st->st_atime = lvb.lvb_atime;
          st->st_ctime = lvb.lvb_ctime;
+        lov_stripe_unlock(lli->lli_smd);
  
          CDEBUG(D_DLMTRACE, "glimpse: size: "LPU64", blocks: "LPU64"\n",
                 (__u64)st->st_size, (__u64)st->st_blocks);
@@ -691,6 +712,23 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen,
                  pos = st->st_size;
          }
  
+        if (local_lock) {
+                struct ost_lvb xtimes;
+
+                lov_stripe_lock(lsm);
+                /* inode might mtime and ctime set earlier in race with stat
+                 * which merged into inode timestamps obtained from mds and
+                 * osts */
+                st->st_atime = st->st_mtime = st->st_ctime = CURRENT_TIME;
+                xtimes.lvb_atime = st->st_atime;
+                xtimes.lvb_mtime = st->st_mtime;
+                xtimes.lvb_ctime = st->st_ctime;
+                obd_update_lvb(exp, lsm, &xtimes,
+                               is_read ? OBD_MD_FLATIME :
+                               (OBD_MD_FLMTIME | OBD_MD_FLCTIME));
+                lov_stripe_unlock(lsm);
+        }
+
          for (iovidx = 0; iovidx < iovlen; iovidx++) {
                  char *buf = (char *) iovec[iovidx].iov_base;
                  size_t count = iovec[iovidx].iov_len;
diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c

index baf8ed9..107cd02 100644 (file)
--- a/lustre/liblustre/super.c
+++ b/lustre/liblustre/super.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light Super operations
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *  Copyright (c) 2002-2004 Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * lustre/liblustre/super.c
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Lustre Light Super operations
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -75,12 +92,12 @@ static int ll_permission(struct inode *inode, int mask)
  
          if ((mask & (MAY_READ|MAY_WRITE)) ||
              (st->st_mode & S_IXUGO))
-                if (capable(CAP_DAC_OVERRIDE))
+                if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
                          return 0;
  
          if (mask == MAY_READ ||
              (S_ISDIR(st->st_mode) && !(mask & MAY_WRITE))) {
-                if (capable(CAP_DAC_READ_SEARCH))
+                if (cfs_capable(CFS_CAP_DAC_READ_SEARCH))
                          return 0;
          }
  
@@ -135,19 +152,22 @@ void llu_update_inode(struct inode *inode, struct mds_body *body,
  
          if (body->valid & OBD_MD_FLID)
                  st->st_ino = body->ino;
-        if (body->valid & OBD_MD_FLATIME &&
-            body->atime > LTIME_S(st->st_atime))
-                LTIME_S(st->st_atime) = body->atime;
-        
-        /* mtime is always updated with ctime, but can be set in past.
-           As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, so take mtime from mds 
-           for the same ctimes. */
-        if (body->valid & OBD_MD_FLCTIME &&
-            body->ctime >= LTIME_S(st->st_ctime)) {
-                LTIME_S(st->st_ctime) = body->ctime;
-                if (body->valid & OBD_MD_FLMTIME)
+        if (body->valid & OBD_MD_FLGENER)
+                lli->lli_st_generation = body->generation;
+        if (body->valid & OBD_MD_FLMTIME) {
+                if (body->mtime > LTIME_S(st->st_mtime))
                          LTIME_S(st->st_mtime) = body->mtime;
+                lli->lli_lvb.lvb_mtime = body->mtime;
+        }
+        if (body->valid & OBD_MD_FLATIME) {
+                if (body->atime > LTIME_S(st->st_atime))
+                        LTIME_S(st->st_atime) = body->atime;
+                lli->lli_lvb.lvb_atime = body->atime;
+        }
+        if (body->valid & OBD_MD_FLCTIME) {
+                if (body->ctime > LTIME_S(st->st_ctime))
+                        LTIME_S(st->st_ctime) = body->ctime;
+                lli->lli_lvb.lvb_ctime = body->ctime;
          }
          if (body->valid & OBD_MD_FLMODE)
                  st->st_mode = (st->st_mode & S_IFMT)|(body->mode & ~S_IFMT);
@@ -171,16 +191,8 @@ void llu_update_inode(struct inode *inode, struct mds_body *body,
                  st->st_blocks = body->blocks;
          if (body->valid & OBD_MD_FLFLAGS)
                  lli->lli_st_flags = body->flags;
-        if (body->valid & OBD_MD_FLGENER)
-                lli->lli_st_generation = body->generation;
  
-        /* fillin fid */
-        if (body->valid & OBD_MD_FLID)
-                lli->lli_fid.id = body->ino;
-        if (body->valid & OBD_MD_FLGENER)
-                lli->lli_fid.generation = body->generation;
-        if (body->valid & OBD_MD_FLTYPE)
-                lli->lli_fid.f_type = body->mode & S_IFMT;
+        lli->lli_fid = body->fid1;
  }
  
  void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
@@ -410,6 +422,8 @@ static int llu_have_md_lock(struct inode *inode, __u64 lockpart)
  static int llu_inode_revalidate(struct inode *inode)
  {
          struct lov_stripe_md *lsm = NULL;
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct intnl_stat *st = llu_i2stat(inode);
          ENTRY;
  
          if (!inode) {
@@ -427,15 +441,15 @@ static int llu_inode_revalidate(struct inode *inode)
  
                  /* Why don't we update all valid MDS fields here, if we're
                   * doing an RPC anyways?  -phil */
-                if (S_ISREG(llu_i2stat(inode)->st_mode)) {
+                if (S_ISREG(st->st_mode)) {
                          ealen = obd_size_diskmd(sbi->ll_osc_exp, NULL);
                          valid |= OBD_MD_FLEASIZE;
                  }
-                ll_inode2fid(&fid, inode);
+                llu_inode2fid(&fid, inode);
                  rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
                  if (rc) {
                          CERROR("failure %d inode %llu\n", rc,
-                               (long long)llu_i2stat(inode)->st_ino);
+                               (long long)st->st_ino);
                          RETURN(-abs(rc));
                  }
                  rc = mdc_req2lustre_md(req, REPLY_REC_OFF, sbi->ll_osc_exp,&md);
@@ -456,18 +470,23 @@ static int llu_inode_revalidate(struct inode *inode)
  
  
                  llu_update_inode(inode, md.body, md.lsm);
-                if (md.lsm != NULL && llu_i2info(inode)->lli_smd != md.lsm)
+                if (md.lsm != NULL && lli->lli_smd != md.lsm)
                          obd_free_memmd(sbi->ll_osc_exp, &md.lsm);
  
                  if (md.body->valid & OBD_MD_FLSIZE)
                          set_bit(LLI_F_HAVE_MDS_SIZE_LOCK,
-                                &llu_i2info(inode)->lli_flags);
+                                &lli->lli_flags);
                  ptlrpc_req_finished(req);
          }
  
-        lsm = llu_i2info(inode)->lli_smd;
-        if (!lsm)       /* object not yet allocated, don't validate size */
+        lsm = lli->lli_smd;
+        if (!lsm) {
+                /* object not yet allocated, don't validate size */
+                st->st_atime = lli->lli_lvb.lvb_atime;
+                st->st_mtime = lli->lli_lvb.lvb_mtime;
+                st->st_ctime = lli->lli_lvb.lvb_ctime;
                  RETURN(0);
+        }
  
          /* ll_glimpse_size will prefer locally cached writes if they extend
           * the file */
@@ -532,7 +551,7 @@ void llu_clear_inode(struct inode *inode)
                 (long long)llu_i2stat(inode)->st_ino, lli->lli_st_generation,
                 inode);
  
-        ll_inode2fid(&fid, inode);
+        llu_inode2fid(&fid, inode);
          clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &(lli->lli_flags));
          mdc_change_cbdata(sbi->ll_mdc_exp, &fid, null_if_equal, inode);
  
@@ -592,7 +611,7 @@ static int inode_setattr(struct inode * inode, struct iattr * attr)
                  st->st_ctime = attr->ia_ctime;
          if (ia_valid & ATTR_MODE) {
                  st->st_mode = attr->ia_mode;
-                if (!in_group_p(st->st_gid) && !capable(CAP_FSETID))
+                if (!in_group_p(st->st_gid) && !cfs_capable(CFS_CAP_FSETID))
                          st->st_mode &= ~S_ISGID;
          }
          /* mark_inode_dirty(inode); */
@@ -649,16 +668,6 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
                  attr->ia_mtime = CURRENT_TIME;
                  attr->ia_valid |= ATTR_MTIME_SET;
          }
-        if ((attr->ia_valid & ATTR_CTIME) && !(attr->ia_valid & ATTR_MTIME)) {
-                /* To avoid stale mtime on mds, obtain it from ost and send 
-                   to mds. */
-                rc = llu_glimpse_size(inode);
-                if (rc) 
-                        RETURN(rc);
-                
-                attr->ia_valid |= ATTR_MTIME_SET | ATTR_MTIME;
-                attr->ia_mtime = inode->i_stbuf.st_mtime;
-        }
  
          if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
                  CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
@@ -718,7 +727,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
                          } else {
                                  /* from inode_change_ok() */
                                  if (current->fsuid != st->st_uid &&
-                                    !capable(CAP_FOWNER))
+                                    !cfs_capable(CFS_CAP_FOWNER))
                                          RETURN(-EPERM);
                          }
                  }
@@ -779,14 +788,86 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
          } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
                  struct obd_info oinfo = { { { 0 } } };
                  struct obdo oa;
+                struct lustre_handle lockh = { 0 };
+                obd_valid valid;
  
                  CDEBUG(D_INODE, "set mtime on OST inode %llu to %lu\n",
                         (long long)st->st_ino, LTIME_S(attr->ia_mtime));
+
                  oa.o_id = lsm->lsm_object_id;
                  oa.o_valid = OBD_MD_FLID;
  
-                obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
-                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+                valid = OBD_MD_FLTYPE;
+
+                if (LTIME_S(attr->ia_mtime) < LTIME_S(attr->ia_ctime)){
+                        struct ost_lvb xtimes;
+
+                        /* setting mtime to past is performed under PW
+                         * EOF extent lock */
+                        oinfo.oi_policy.l_extent.start = 0;
+                        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
+                        rc = llu_extent_lock(NULL, inode, lsm, LCK_PW,
+                                             &oinfo.oi_policy,
+                                             &lockh, 0);
+                        if (rc)
+                                RETURN(rc);
+
+                        /* setattr under locks
+                         *
+                         * 1. restore inode's timestamps which are
+                         * about to be set as long as concurrent stat
+                         * (via llu_glimpse_size) might bring
+                         * out-of-date ones
+                         *
+                         * 2. update lsm so that next stat (via
+                         * llu_glimpse_size) could get correct values
+                         * in lsm */
+                        lov_stripe_lock(lsm);
+                        if (ia_valid & ATTR_ATIME) {
+                                st->st_atime = xtimes.lvb_atime =
+                                        attr->ia_atime;
+                                valid |= OBD_MD_FLATIME;
+                        }
+                        if (ia_valid & ATTR_MTIME) {
+                                st->st_mtime = xtimes.lvb_mtime =
+                                        attr->ia_mtime;
+                                valid |= OBD_MD_FLMTIME;
+                        }
+                        if (ia_valid & ATTR_CTIME) {
+                                st->st_ctime = xtimes.lvb_ctime =
+                                        attr->ia_mtime;
+                                valid |= OBD_MD_FLCTIME;
+                        }
+
+                        obd_update_lvb(sbi->ll_osc_exp, lsm,
+                                       &xtimes, valid);
+                        lov_stripe_unlock(lsm);
+                } else {
+                        /* lockless setattr
+                         *
+                         * 1. do not use inode's timestamps because
+                         * concurrent stat might fill the inode with
+                         * out-of-date times, send values from attr
+                         * instead
+                         *
+                         * 2.do no update lsm, as long as stat (via
+                         * ll_glimpse_size) will bring attributes from
+                         * osts anyway */
+                        if (ia_valid & ATTR_ATIME) {
+                                oa.o_atime = attr->ia_atime;
+                                oa.o_valid |= OBD_MD_FLATIME;
+                        }
+                        if (ia_valid & ATTR_MTIME) {
+                                oa.o_mtime = attr->ia_mtime;
+                                oa.o_valid |= OBD_MD_FLMTIME;
+                        }
+                        if (ia_valid & ATTR_CTIME) {
+                                oa.o_ctime = attr->ia_ctime;
+                                oa.o_valid |= OBD_MD_FLCTIME;
+                        }
+                }
+
+                obdo_from_inode(&oa, inode, valid);
  
                  oinfo.oi_oa = &oa;
                  oinfo.oi_md = lsm;
@@ -794,6 +875,19 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
                  rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL);
                  if (rc)
                          CERROR("obd_setattr_async fails: rc=%d\n", rc);
+
+                if (LTIME_S(attr->ia_mtime) < LTIME_S(attr->ia_ctime)){
+                        int err;
+
+                        err = llu_extent_unlock(NULL, inode, lsm,
+                                               LCK_PW, &lockh);
+                        if (unlikely(err != 0)) {
+                                CERROR("extent unlock failed: "
+                                       "err=%d\n", err);
+                                if (rc == 0)
+                                        rc = err;
+                        }
+                }
          }
          RETURN(rc);
  }
@@ -869,10 +963,9 @@ static int llu_iop_symlink_raw(struct pnode *pno, const char *tgt)
                  RETURN(err);
  
          llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0);
-        err = mdc_create(sbi->ll_mdc_exp, &op_data,
-                         tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO,
-                         current->fsuid, current->fsgid, current->cap_effective,
-                         0, &request);
+        err = mdc_create(sbi->ll_mdc_exp, &op_data, tgt, strlen(tgt) + 1,
+                         S_IFLNK | S_IRWXUGO, current->fsuid, current->fsgid,
+                         cfs_curproc_cap_pack(), 0, &request);
          ptlrpc_req_finished(request);
          liblustre_wait_event(0);
          RETURN(err);
@@ -898,7 +991,7 @@ static int llu_readlink_internal(struct inode *inode,
                  RETURN(0);
          }
  
-        ll_inode2fid(&fid, inode);
+        llu_inode2fid(&fid, inode);
          rc = mdc_getattr(sbi->ll_mdc_exp, &fid,
                           OBD_MD_LINKNAME, symlen, request);
          if (rc) {
@@ -1001,7 +1094,7 @@ static int llu_iop_mknod_raw(struct pnode *pno,
                                          0);
                  err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode,
                                   current->fsuid, current->fsgid,
-                                 current->cap_effective, dev, &request);
+                                 cfs_curproc_cap_pack(), dev, &request);
                  ptlrpc_req_finished(request);
                  break;
          case S_IFDIR:
@@ -1229,9 +1322,9 @@ static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode)
                  RETURN(err);
  
          llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0);
-        err = mdc_create(llu_i2sbi(dir)->ll_mdc_exp, &op_data, NULL, 0, mode | S_IFDIR,
-                         current->fsuid, current->fsgid, current->cap_effective,
-                         0, &request);
+        err = mdc_create(llu_i2sbi(dir)->ll_mdc_exp, &op_data, NULL, 0,
+                         mode | S_IFDIR, current->fsuid, current->fsgid,
+                         cfs_curproc_cap_pack(), 0, &request);
          ptlrpc_req_finished(request);
          liblustre_wait_event(0);
          RETURN(err);
@@ -1600,11 +1693,25 @@ static int llu_lov_dir_setstripe(struct inode *ino, unsigned long arg)
          if (rc)
                  return(-EFAULT);
  
-        if (lum.lmm_magic != LOV_USER_MAGIC)
+        switch (lum.lmm_magic) {
+        case LOV_USER_MAGIC_V1: {
+                if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+                        lustre_swab_lov_user_md_v1(&lum);
+                break;
+                }
+        case LOV_USER_MAGIC_V3: {
+                if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)&lum);
+                break;
+                }
+        default: {
+                CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+                                " %#08x != %#08x nor %#08x\n",
+                                lum.lmm_magic, LOV_USER_MAGIC_V1,
+                                LOV_USER_MAGIC_V3);
                  RETURN(-EINVAL);
-
-        if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC))
-                lustre_swab_lov_user_md(&lum);
+        }
+        }
  
          /* swabbing is done in lov_setstripe() on server side */
          rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
@@ -1814,9 +1921,7 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md)
          }
  
          /* try to find existing inode */
-        fid.id = md->body->ino;
-        fid.generation = md->body->generation;
-        fid.f_type = md->body->mode & S_IFMT;
+        fid = md->body->fid1;
  
          inode = _sysio_i_find(fs, &fileid);
          if (inode) {
@@ -1938,7 +2043,7 @@ llu_fsswop_mount(const char *source,
                             sizeof(async), &async, NULL);
  
          ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION |
-                OBD_CONNECT_AT;
+                                OBD_CONNECT_AT | OBD_CONNECT_VBR;
  #ifdef LIBLUSTRE_POSIX_ACL
          ocd.ocd_connect_flags |= OBD_CONNECT_ACL;
  #endif
@@ -1983,6 +2088,9 @@ llu_fsswop_mount(const char *source,
          }
          sbi->ll_osc_exp = class_conn2export(&osc_conn);
          sbi->ll_lco.lco_flags = ocd.ocd_connect_flags;
+        sbi->ll_lco.lco_mdc_exp = sbi->ll_mdc_exp;
+        sbi->ll_lco.lco_osc_exp = sbi->ll_osc_exp;
+
  
          err = obd_register_lock_cancel_cb(sbi->ll_osc_exp,
                                            llu_extent_lock_cancel_cb);
diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am

index 08bd86b..bbc30e8 100644 (file)
--- a/lustre/liblustre/tests/Makefile.am
+++ b/lustre/liblustre/tests/Makefile.am
@@ -52,7 +52,8 @@ replay_ost_single_DEPENDENCIES = $(top_builddir)/lustre/liblustre/liblustre.a li
  if MPITESTS
  test_lock_cancel_SOURCES = test_lock_cancel.c
  test_lock_cancel_CFLAGS = $(LL_CFLAGS) -I/opt/lam/include
-test_lock_cancel_LDADD :=  $(LLIB_EXEC)  -L/opt/lam/lib -lmpi -llam
+#test_lock_cancel_LDADD :=  $(LLIB_EXEC)  -L/opt/lam/lib -lmpi -llam
+test_lock_cancel_LDADD :=  $(LLIB_EXEC)  -lmpich
  endif
  
  
diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c

index 62a11de..9531519 100644 (file)
--- a/lustre/liblustre/tests/echo_test.c
+++ b/lustre/liblustre/tests/echo_test.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light user test program
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002-2004 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/tests/echo_test.c
+ *
+ * Lustre Light user test program
   */
  
  #include <liblustre.h>
diff --git a/lustre/liblustre/tests/recovery_small.c b/lustre/liblustre/tests/recovery_small.c

index 6af93f1..3058a77 100644 (file)
--- a/lustre/liblustre/tests/recovery_small.c
+++ b/lustre/liblustre/tests/recovery_small.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light user test program
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/tests/recovery_small.c
+ *
+ * Lustre Light user test program
   */
  
  #define _BSD_SOURCE
diff --git a/lustre/liblustre/tests/replay_ost_single.c b/lustre/liblustre/tests/replay_ost_single.c

index 418ba94..122b7b5 100644 (file)
--- a/lustre/liblustre/tests/replay_ost_single.c
+++ b/lustre/liblustre/tests/replay_ost_single.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light user test program
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/tests/replay_ost_single.c
+ *
+ * Lustre Light user test program
   */
  
  #define _BSD_SOURCE
diff --git a/lustre/liblustre/tests/replay_single.c b/lustre/liblustre/tests/replay_single.c

index 17155f8..2a4d4e8 100644 (file)
--- a/lustre/liblustre/tests/replay_single.c
+++ b/lustre/liblustre/tests/replay_single.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light user test program
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/tests/replay_single.c
+ *
+ * Lustre Light user test program
   */
  
  #define _BSD_SOURCE
diff --git a/lustre/liblustre/tests/sanity.c b/lustre/liblustre/tests/sanity.c

index 2309907..cb3d672 100644 (file)
--- a/lustre/liblustre/tests/sanity.c
+++ b/lustre/liblustre/tests/sanity.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light user test program
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/tests/sanity.c
+ *
+ * Lustre Light user test program
   */
  
  #define _BSD_SOURCE
diff --git a/lustre/liblustre/tests/test_common.c b/lustre/liblustre/tests/test_common.c

index 29377b1..88ea23c 100644 (file)
--- a/lustre/liblustre/tests/test_common.c
+++ b/lustre/liblustre/tests/test_common.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <sys/stat.h>
diff --git a/lustre/liblustre/tests/test_common.h b/lustre/liblustre/tests/test_common.h

index 5949a42..fb4937c 100644 (file)
--- a/lustre/liblustre/tests/test_common.h
+++ b/lustre/liblustre/tests/test_common.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __TEST_COMMON__H
  #define __TEST_COMMON__H
  
diff --git a/lustre/liblustre/tests/test_lock_cancel.c b/lustre/liblustre/tests/test_lock_cancel.c

index 9350e4c..8d0c4a3 100644 (file)
--- a/lustre/liblustre/tests/test_lock_cancel.c
+++ b/lustre/liblustre/tests/test_lock_cancel.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light user test program
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/liblustre/tests/test_lock_cancel.c
+ *
+ * Lustre Light user test program
   */
  
  #define _BSD_SOURCE
diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in

index fad7b02..b5ed7be 100644 (file)
--- a/lustre/llite/Makefile.in
+++ b/lustre/llite/Makefile.in
@@ -3,10 +3,6 @@ lustre-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o
  
  llite_lloop-objs := lloop.o
  
-ifeq ($(PATCHLEVEL),4)
-lustre-objs += rw24.o super.o
-else
  lustre-objs += rw26.o super25.o
-endif
  
  @INCLUDE_RULES@
diff --git a/lustre/llite/autoMakefile.am b/lustre/llite/autoMakefile.am

index 3ebe906..5dcc0cd 100644 (file)
--- a/lustre/llite/autoMakefile.am
+++ b/lustre/llite/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if MODULES
  modulefs_DATA = lustre$(KMODEXT) llite_lloop$(KMODEXT)
diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c

index 96e01b1..d4ad3f2 100644 (file)
--- a/lustre/llite/dcache.c
+++ b/lustre/llite/dcache.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <linux/fs.h>
@@ -34,6 +49,8 @@
  
  #include "llite_internal.h"
  
+spinlock_t ll_lookup_lock = SPIN_LOCK_UNLOCKED;
+
  /* should NOT be called with the dcache lock, see fs/dcache.c */
  static void ll_release(struct dentry *de)
  {
@@ -122,14 +139,13 @@ void ll_set_dd(struct dentry *de)
          if (de->d_fsdata == NULL) {
                  struct ll_dentry_data *lld;
  
-                OBD_ALLOC(lld, sizeof(struct ll_dentry_data));
+                OBD_ALLOC_PTR(lld);
                  if (likely(lld != NULL)) {
-                        cfs_waitq_init(&lld->lld_waitq);
                          lock_dentry(de);
                          if (likely(de->d_fsdata == NULL))
                                  de->d_fsdata = lld;
                          else
-                                OBD_FREE(lld, sizeof(struct ll_dentry_data));
+                                OBD_FREE_PTR(lld);
                          unlock_dentry(de);
                  }
          }
@@ -191,11 +207,13 @@ int ll_drop_dentry(struct dentry *dentry)
                  __d_drop(dentry);
                  unlock_dentry(dentry);
                  spin_unlock(&dcache_lock);
+                spin_unlock(&ll_lookup_lock);
                  dput(dentry);
+                spin_lock(&ll_lookup_lock);
                  spin_lock(&dcache_lock);
                  return 1;
          }
-       /* disconected dentry can not be find without lookup, because we 
+       /* disconected dentry can not be find without lookup, because we
          * not need his to unhash or mark invalid. */
         if (dentry->d_flags & DCACHE_DISCONNECTED) {
                 unlock_dentry(dentry);
@@ -216,14 +234,6 @@ int ll_drop_dentry(struct dentry *dentry)
                   * sys_getcwd() could return -ENOENT -bzzz */
  #ifdef DCACHE_LUSTRE_INVALID
                  dentry->d_flags |= DCACHE_LUSTRE_INVALID;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                __d_drop(dentry);
-                if (dentry->d_inode) {
-                        /* Put positive dentries to orphan list */
-                        list_add(&dentry->d_hash,
-                                 &ll_i2sbi(dentry->d_inode)->ll_orphan_dentry_list);
-                }
-#endif
  #else
                  if (!dentry->d_inode || !S_ISDIR(dentry->d_inode->i_mode))
                          __d_drop(dentry);
@@ -248,6 +258,7 @@ void ll_unhash_aliases(struct inode *inode)
                 inode->i_ino, inode->i_generation, inode);
  
          head = &inode->i_dentry;
+        spin_lock(&ll_lookup_lock);
          spin_lock(&dcache_lock);
  restart:
          tmp = head;
@@ -276,6 +287,8 @@ restart:
                            goto restart;
          }
          spin_unlock(&dcache_lock);
+        spin_unlock(&ll_lookup_lock);
+
          EXIT;
  }
  
@@ -311,21 +324,17 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
  
          /* drop lookup or getattr locks immediately */
          if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) {
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
                  /* on 2.6 there are situation when several lookups and
                   * revalidations may be requested during single operation.
                   * therefore, we don't release intent here -bzzz */
                  ll_intent_drop_lock(it);
-#else
-                ll_intent_release(it);
-#endif
          }
  }
  
  void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
  {
          struct lookup_intent *it = *itp;
-#if defined(HAVE_VFS_INTENT_PATCHES)&&(LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#ifdef HAVE_VFS_INTENT_PATCHES
          if (it) {
                  LASSERTF(it->it_magic == INTENT_MAGIC, "bad intent magic: %x\n",
                           it->it_magic);
@@ -343,7 +352,7 @@ void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
  int ll_revalidate_it(struct dentry *de, int lookup_flags,
                       struct lookup_intent *it)
  {
-        struct mdc_op_data op_data;
+        struct mdc_op_data op_data = { { 0 } };
          struct ptlrpc_request *req = NULL;
          struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
          struct obd_export *exp;
@@ -440,11 +449,18 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags,
  
  do_lock:
          it->it_create_mode &= ~current->fs->umask;
-
+        it->it_flags |= O_CHECK_STALE;
          rc = mdc_intent_lock(exp, &op_data, NULL, 0, it, lookup_flags,
                               &req, ll_mdc_blocking_ast, 0);
+        it->it_flags &= ~O_CHECK_STALE;
          if (it->it_op == IT_GETATTR && !first)
-                ll_statahead_exit(de, rc);
+                /* If there are too many locks on client-side, then some
+                 * locks taken by statahead maybe dropped automatically
+                 * before the real "revalidate" using them. */
+                ll_statahead_exit(de, req == NULL ? rc : 0);
+        else if (first == -EEXIST)
+                ll_statahead_mark(de);
+
          /* If req is NULL, then mdc_intent_lock only tried to do a lock match;
           * if all was well, it will return 1 if it found locks, 0 otherwise. */
          if (req == NULL && rc >= 0) {
@@ -454,7 +470,15 @@ do_lock:
          }
  
          if (rc < 0) {
-                if (rc != -ESTALE) {
+                if (-ESTALE == rc) {
+                        if (it_disposition(it, DISP_OPEN_OPEN) &&
+                            !it_open_error(DISP_OPEN_OPEN, it))
+                                /* server have valid open - close file first*/
+                                ll_release_openhandle(de, it);
+                        /* release intent reference to avoid having stale 'it'
+                         * in namedata for old VFS intent */
+                        ll_intent_drop_lock(it);
+                } else {
                          CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
                                 "%d\n", rc, it->d.lustre.it_status);
                  }
@@ -464,11 +488,14 @@ do_lock:
  revalidate_finish:
          rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, it, de);
          if (rc != 0) {
+                /* we are going release the intent, so clear DISP_ENQ_COMPLETE
+                 * to prevent a double free of the request */
+                it_clear_disposition(it, DISP_ENQ_COMPLETE);
                  ll_intent_release(it);
                  GOTO(out, rc = 0);
          }
-        if ((it->it_op & IT_OPEN) && de->d_inode && 
-            !S_ISREG(de->d_inode->i_mode) && 
+        if ((it->it_op & IT_OPEN) && de->d_inode &&
+            !S_ISREG(de->d_inode->i_mode) &&
              !S_ISDIR(de->d_inode->i_mode)) {
                  ll_release_openhandle(de, it);
          }
@@ -476,12 +503,14 @@ revalidate_finish:
  
          /* unfortunately ll_intent_lock may cause a callback and revoke our
           * dentry */
+        spin_lock(&ll_lookup_lock);
          spin_lock(&dcache_lock);
          lock_dentry(de);
          __d_drop(de);
          unlock_dentry(de);
          d_rehash_cond(de, 0);
          spin_unlock(&dcache_lock);
+        spin_unlock(&ll_lookup_lock);
  
   out:
          /* We do not free request as it may be reused during following lookup
@@ -546,6 +575,9 @@ do_lookup:
                  /* see if we got same inode, if not - return error */
                  if(!memcmp(&fid, &mds_body->fid1, sizeof(struct ll_fid)))
                          goto revalidate_finish;
+                /* we are going release the intent, so clear DISP_ENQ_COMPLETE
+                 * to prevent a double free of the request */
+                it_clear_disposition(it, DISP_ENQ_COMPLETE);
                  ll_intent_release(it);
          }
          GOTO(out, rc = 0);
@@ -558,7 +590,9 @@ out_sa:
          if (it && it->it_op == IT_GETATTR && rc == 1) {
                  first = ll_statahead_enter(de->d_parent->d_inode, &de, 0);
                  if (!first)
-                        ll_statahead_exit(de, rc);
+                        ll_statahead_exit(de, 1);
+                else if (first == -EEXIST)
+                        ll_statahead_mark(de);
          }
  
          return rc;
@@ -593,8 +627,8 @@ out_sa:
          unlock_kernel();
  
          handle = (flag) ? &ldd->lld_mnt_och : &ldd->lld_cwd_och;
-        rc = obd_pin(sbi->ll_mdc_exp, inode->i_ino, inode->i_generation,
-                     inode->i_mode & S_IFMT, handle, flag);
+        rc = obd_pin(sbi->ll_mdc_exp, ll_inode_ll_fid(inode),
+                     handle, flag);
  
          if (rc) {
                  lock_kernel();
@@ -648,7 +682,6 @@ out_sa:
          return;
  }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  #ifdef HAVE_VFS_INTENT_PATCHES
  static int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
  {
@@ -675,7 +708,8 @@ int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
                          RETURN(0);
                  if (it->it_op == (IT_OPEN|IT_CREAT))
                          if (nd->intent.open.flags & O_EXCL) {
-                                CDEBUG(D_VFSTRACE, "create O_EXCL, returning 0\n");
+                                CDEBUG(D_VFSTRACE,
+                                       "create O_EXCL, returning 0\n");
                                  rc = 0;
                                  goto out_it;
                          }
@@ -719,7 +753,7 @@ int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
                          ll_d2d(dentry)->lld_it = it;
                          it = NULL; /* avoid freeing */
                  }
-                        
+
  out_it:
                  if (it) {
                          ll_intent_release(it);
@@ -732,14 +766,9 @@ out_it:
          RETURN(rc);
  }
  #endif
-#endif
  
  struct dentry_operations ll_d_ops = {
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
          .d_revalidate = ll_revalidate_nd,
-#else
-        .d_revalidate_it = ll_revalidate_it,
-#endif
          .d_release = ll_release,
          .d_delete = ll_ddelete,
  #ifdef DCACHE_LUSTRE_INVALID
@@ -750,45 +779,3 @@ struct dentry_operations ll_d_ops = {
          .d_unpin = ll_unpin,
  #endif
  };
-
-static int ll_fini_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
-{
-        ENTRY;
-        /* need lookup */
-        RETURN(0);
-}
-
-struct dentry_operations ll_fini_d_ops = {
-        .d_revalidate = ll_fini_revalidate_nd,
-        .d_release = ll_release,
-};
-
-/*
- * It is for the following race condition:
- * When someone (maybe statahead thread) adds the dentry to the dentry hash
- * table, the dentry's "d_op" maybe NULL, at the same time, another (maybe
- * "ls -l") process finds such dentry by "do_lookup()" without "do_revalidate()"
- * called. It causes statahead window lost, and maybe other issues. --Fan Yong
- */
-static int ll_init_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
-{
-        struct l_wait_info lwi = { 0 };
-        struct ll_dentry_data *lld;
-        ENTRY;
-
-        ll_set_dd(dentry);
-        lld = ll_d2d(dentry);
-        if (unlikely(lld == NULL))
-                RETURN(-ENOMEM);
-
-        l_wait_event(lld->lld_waitq, dentry->d_op != &ll_init_d_ops, &lwi);
-        if (likely(dentry->d_op == &ll_d_ops))
-                RETURN(ll_revalidate_nd(dentry, nd));
-        else
-                RETURN(dentry->d_op == &ll_fini_d_ops ? 0 : -EINVAL);
-}
-
-struct dentry_operations ll_init_d_ops = {
-        .d_revalidate = ll_init_revalidate_nd,
-        .d_release = ll_release,
-};
diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c

index ee9b255..05f8157 100644 (file)
--- a/lustre/llite/dir.c
+++ b/lustre/llite/dir.c
@@ -1,28 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Directory code for lustre client.
+ * GPL HEADER START
   *
- *  Copyright (C) 2002--2007 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/dir.c
+ *
+ * Directory code for lustre client.
   */
  
  #include <linux/fs.h>
@@ -31,11 +44,7 @@
  #include <linux/version.h>
  #include <linux/smp_lock.h>
  #include <asm/uaccess.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# include <linux/locks.h>   // for wait_on_buffer
-#else
-# include <linux/buffer_head.h>   // for wait_on_buffer
-#endif
+#include <linux/buffer_head.h>   // for wait_on_buffer
  
  #define DEBUG_SUBSYSTEM S_LLITE
  
@@ -71,7 +80,7 @@ static int ll_dir_readpage(struct file *file, struct page *page)
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off "LPU64"\n",
                 inode->i_ino, inode->i_generation, inode, offset);
  
-        mdc_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR);
+        ll_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR);
  
          rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &mdc_fid,
                            offset, page, &request);
@@ -200,8 +209,7 @@ static void ll_dir_check_page(struct inode *dir, struct page *page)
  
  struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
  {
-        struct ldlm_res_id res_id =
-                { .name = { dir->i_ino, (__u64)dir->i_generation} };
+        struct ldlm_res_id res_id;
          struct lustre_handle lockh;
          struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
          struct address_space *mapping = dir->i_mapping;
@@ -209,6 +217,7 @@ struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
          ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
          int rc;
  
+        fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
          rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
                               &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
          if (!rc) {
@@ -216,7 +225,7 @@ struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
                  struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR,
                         ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
                  struct ptlrpc_request *request;
-                struct mdc_op_data data;
+                struct mdc_op_data data = { { 0 } };
  
                  ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0, NULL);
  
@@ -302,8 +311,8 @@ static unsigned char ll_dir_filetype_table[LL_DIR_FT_MAX] = {
   *       0: no live entries on this page.
   */
  
-int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
-                    filldir_t filldir, void *cookie)
+static int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
+                           filldir_t filldir, void *cookie)
  {
          struct ll_dir_entry *de;
          char *end;
@@ -325,7 +334,7 @@ int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
          return nr;
  }
  
-int ll_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ll_readdir_18(struct file *filp, void *dirent, filldir_t filldir)
  {
          struct inode *inode = filp->f_dentry->d_inode;
          loff_t pos          = filp->f_pos;
@@ -410,6 +419,438 @@ int ll_readdir(struct file *filp, void *dirent, filldir_t filldir)
          RETURN(rc);
  }
  
+/*      
+ * Chain of hash overflow pages.
+ */            
+struct ll_dir_chain {
+        /* XXX something. Later */
+};
+  
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{  
+}
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}
+
+static inline __u32 hash_x_index(__u32 value)
+{
+        return ((__u32)~0) - value;
+}
+
+/*
+ * Layout of readdir pages, as transmitted on wire.
+ */     
+struct lu_dirent {
+        struct lu_fid lde_fid;
+        __u64         lde_hash;
+        __u16         lde_reclen;
+        __u16         lde_namelen;
+        __u32         lde_padding;
+        char          lde_name[0];
+};
+
+struct lu_dirpage {
+        __u64            ldp_hash_start;
+        __u64            ldp_hash_end;
+        __u16            ldp_flags;
+        __u16            ldp_pad0;
+        __u32            ldp_pad1;
+        struct lu_dirent ldp_entries[0];
+};
+
+enum lu_dirpage_flags {
+        LDF_EMPTY = 1 << 0
+};
+
+static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
+{
+        if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY)
+                return NULL;
+        else
+                return dp->ldp_entries;
+}
+
+static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
+{
+        struct lu_dirent *next;
+
+        if (le16_to_cpu(ent->lde_reclen) != 0)
+                next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
+        else
+                next = NULL;
+
+        return next;
+}
+
+static inline int lu_dirent_size(struct lu_dirent *ent)
+{
+        if (le16_to_cpu(ent->lde_reclen) == 0) {
+                return (sizeof(*ent) +
+                        le16_to_cpu(ent->lde_namelen) + 3) & ~3;
+        }
+        return le16_to_cpu(ent->lde_reclen);
+}
+
+#define DIR_END_OFF              0xfffffffffffffffeULL
+
+#ifdef HAVE_RW_TREE_LOCK
+#define TREE_READ_LOCK_IRQ(mapping)     read_lock_irq(&(mapping)->tree_lock)
+#define TREE_READ_UNLOCK_IRQ(mapping) read_unlock_irq(&(mapping)->tree_lock)
+#else
+#define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock)
+#define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock)
+#endif
+
+/* returns the page unlocked, but with a reference */
+static int ll_dir_readpage_20(struct file *file, struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct ptlrpc_request *request;
+        struct mdt_body *body;
+        struct ll_fid fid;
+        __u64 hash;
+        int rc;
+        ENTRY;
+
+        hash = hash_x_index(page->index);
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
+               inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
+
+        ll_inode2fid(&fid, inode);
+        rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &fid,
+                          hash, page, &request);
+        if (!rc) {
+                body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
+                                      sizeof(*body));
+                /* Checked by mdc_readpage() */
+                LASSERT(body != NULL);
+
+                if (body->valid & OBD_MD_FLSIZE) {
+                        ll_inode_size_lock(inode, 0);
+                        i_size_write(inode, body->size);
+                        ll_inode_size_unlock(inode, 0);
+                }
+                SetPageUptodate(page);
+        }
+        ptlrpc_req_finished(request);
+
+        unlock_page(page);
+        EXIT;
+        return rc;
+}
+
+
+static void ll_check_page(struct inode *dir, struct page *page)
+{
+        /* XXX: check page format later */
+        SetPageChecked(page);
+}
+
+
+/*
+ * Find, kmap and return page that contains given hash.
+ */
+static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
+                                       __u64 *start, __u64 *end)
+{
+        struct address_space *mapping = dir->i_mapping;
+        /*
+         * Complement of hash is used as an index so that
+         * radix_tree_gang_lookup() can be used to find a page with starting
+         * hash _smaller_ than one we are looking for.
+         */
+        unsigned long offset = hash_x_index(hash);
+        struct page *page;
+        int found;
+        ENTRY;
+
+        TREE_READ_LOCK_IRQ(mapping);
+        found = radix_tree_gang_lookup(&mapping->page_tree,
+                                       (void **)&page, offset, 1);
+        if (found > 0) {
+                struct lu_dirpage *dp;
+
+                page_cache_get(page);
+                TREE_READ_UNLOCK_IRQ(mapping);
+                /*
+                 * In contrast to find_lock_page() we are sure that directory
+                 * page cannot be truncated (while DLM lock is held) and,
+                 * hence, can avoid restart.
+                 *
+                 * In fact, page cannot be locked here at all, because
+                 * ll_dir_readpage() does synchronous io.
+                 */
+                wait_on_page(page);
+                if (PageUptodate(page)) {
+                        dp = kmap(page);
+                        *start = le64_to_cpu(dp->ldp_hash_start);
+                        *end   = le64_to_cpu(dp->ldp_hash_end);
+                        LASSERT(*start <= hash);
+                        if (hash > *end || (*end != *start && hash == *end)) {
+                                kunmap(page);
+                                lock_page(page);
+                                ll_truncate_complete_page(page);
+                                unlock_page(page);
+                                page_cache_release(page);
+                                page = NULL;
+                        }
+                } else {
+                        page_cache_release(page);
+                        page = ERR_PTR(-EIO);
+                }
+
+        } else {
+                TREE_READ_UNLOCK_IRQ(mapping);
+                page = NULL;
+        }
+        RETURN(page);
+}
+
+static struct page *ll_get_dir_page_20(struct inode *dir, __u64 hash, int exact,
+                                       struct ll_dir_chain *chain)
+{
+        struct ldlm_res_id res_id;
+        struct lustre_handle lockh;
+        struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
+        struct address_space *mapping = dir->i_mapping;
+        struct lu_dirpage *dp;
+        struct page *page;
+        ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
+        ldlm_mode_t mode;
+        int rc;
+        __u64 start = 0;
+        __u64 end = 0;
+        ENTRY;
+ 
+        fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
+        mode = LCK_PR;
+        rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
+                             &res_id, LDLM_IBITS, &policy, mode, &lockh);
+        if (!rc) {
+                struct lookup_intent it = { .it_op = IT_READDIR };
+                struct ldlm_enqueue_info einfo = { LDLM_IBITS, mode,
+                       ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
+                struct ptlrpc_request *request;
+                struct mdc_op_data op_data = { { 0 } };
+
+                ll_prepare_mdc_op_data(&op_data, dir, NULL, NULL, 0, 0, NULL);
+
+                rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
+                                 &op_data, &lockh, NULL, 0, 0);
+
+                request = (struct ptlrpc_request *)it.d.lustre.it_data;
+                if (request)
+                        ptlrpc_req_finished(request);
+                if (rc < 0) {
+                        CERROR("lock enqueue: rc: %d\n", rc);
+                        RETURN(ERR_PTR(rc));
+                }
+        }
+        ldlm_lock_dump_handle(D_OTHER, &lockh);
+
+        page = ll_dir_page_locate(dir, hash, &start, &end);
+        if (IS_ERR(page))
+                GOTO(out_unlock, page);
+
+        if (page != NULL) {
+                /*
+                 * XXX nikita: not entirely correct handling of a corner case:
+                 * suppose hash chain of entries with hash value HASH crosses
+                 * border between pages P0 and P1. First both P0 and P1 are
+                 * cached, seekdir() is called for some entry from the P0 part
+                 * of the chain. Later P0 goes out of cache. telldir(HASH)
+                 * happens and finds P1, as it starts with matching hash
+                 * value. Remaining entries from P0 part of the chain are
+                 * skipped. (Is that really a bug?)
+                 *
+                 * Possible solutions: 0. don't cache P1 is such case, handle
+                 * it as an "overflow" page. 1. invalidate all pages at
+                 * once. 2. use HASH|1 as an index for P1.
+                 */
+                if (exact && hash != start) {
+                        /*
+                         * readdir asked for a page starting _exactly_ from
+                         * given hash, but cache contains stale page, with
+                         * entries with smaller hash values. Stale page should
+                         * be invalidated, and new one fetched.
+                         */
+                        CDEBUG(D_INFO, "Stale readpage page %p: %#lx != %#lx\n",
+                              page, (unsigned long)hash, (unsigned long)start);
+                        lock_page(page);
+                        ll_truncate_complete_page(page);
+                        unlock_page(page);
+                        page_cache_release(page);
+                } else {
+                        GOTO(hash_collision, page);
+                }
+        }
+
+        page = read_cache_page(mapping, hash_x_index(hash),
+                               (filler_t*)ll_dir_readpage_20, NULL);
+        if (IS_ERR(page))
+                GOTO(out_unlock, page);
+
+        wait_on_page(page);
+        (void)kmap(page);
+        if (!PageUptodate(page))
+                goto fail;
+        if (!PageChecked(page))
+                ll_check_page(dir, page);
+        if (PageError(page))
+                goto fail;
+hash_collision:
+        dp = page_address(page);
+
+        start = le64_to_cpu(dp->ldp_hash_start);
+        end   = le64_to_cpu(dp->ldp_hash_end);
+        if (end == start) {
+                LASSERT(start == hash);
+                CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
+                /*
+                 * Fetch whole overflow chain...
+                 *
+                 * XXX not yet.
+                 */
+                goto fail;
+        }
+out_unlock:
+        ldlm_lock_decref(&lockh, mode);
+        RETURN(page);
+
+fail:
+        ll_put_page(page);
+        page = ERR_PTR(-EIO);
+        goto out_unlock;
+}
+
+static int ll_readdir_20(struct file *filp, void *cookie, filldir_t filldir)
+{
+        struct inode         *inode = filp->f_dentry->d_inode;
+        struct ll_sb_info    *sbi   = ll_i2sbi(inode);
+        __u64                 pos   = filp->f_pos;
+        struct page          *page;
+        struct ll_dir_chain   chain;
+        int rc;
+        int done;
+        int shift;
+        ENTRY;
+
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu\n",
+               inode->i_ino, inode->i_generation, inode,
+               (unsigned long)pos, i_size_read(inode));
+
+        if (pos == DIR_END_OFF)
+                /*
+                 * end-of-file.
+                 */
+                RETURN(0);
+
+        rc    = 0;
+        done  = 0;
+        shift = 0;
+        ll_dir_chain_init(&chain);
+
+        page = ll_get_dir_page_20(inode, pos, 0, &chain);
+
+        while (rc == 0 && !done) {
+                struct lu_dirpage *dp;
+                struct lu_dirent  *ent;
+
+                if (!IS_ERR(page)) {
+                        /* 
+                         * If page is empty (end of directoryis reached),
+                         * use this value. 
+                         */
+                        __u64 hash = DIR_END_OFF;
+                        __u64 next;
+
+                        dp = page_address(page);
+                        for (ent = lu_dirent_start(dp); ent != NULL && !done;
+                             ent = lu_dirent_next(ent)) {
+                                char          *name;
+                                int            namelen;
+                                struct lu_fid  fid;
+                                ino_t          ino;
+
+                                hash    = le64_to_cpu(ent->lde_hash);
+                                namelen = le16_to_cpu(ent->lde_namelen);
+
+                                if (hash < pos)
+                                        /*
+                                         * Skip until we find target hash
+                                         * value.
+                                         */
+                                        continue;
+
+                                if (namelen == 0)
+                                        /*
+                                         * Skip dummy record.
+                                         */
+                                        continue;
+
+                                fid  = ent->lde_fid;
+                                name = ent->lde_name;
+                                fid_le_to_cpu(&fid, &fid);
+                                ino  = ll_fid_build_ino(sbi, (struct ll_fid*)&fid);
+
+                                done = filldir(cookie, name, namelen,
+                                               (loff_t)hash, ino, DT_UNKNOWN);
+                        }
+                        next = le64_to_cpu(dp->ldp_hash_end);
+                        ll_put_page(page);
+                        if (!done) {
+                                pos = next;
+                                if (pos == DIR_END_OFF)
+                                        /*
+                                         * End of directory reached.
+                                         */
+                                        done = 1;
+                                else if (1 /* chain is exhausted*/)
+                                        /*
+                                         * Normal case: continue to the next
+                                         * page.
+                                         */
+                                        page = ll_get_dir_page_20(inode, pos, 1,
+                                                                  &chain);
+                                else {
+                                        /*
+                                         * go into overflow page.
+                                         */
+                                }
+                        } else {
+                                pos = hash;
+                        }
+                } else {
+                        rc = PTR_ERR(page);
+                        CERROR("error reading dir "DFID" at %lu: rc %d\n",
+                               PFID(ll_inode_lu_fid(inode)),
+                               (unsigned long)pos, rc);
+                }
+        }
+
+        filp->f_pos = (loff_t)(__s32)pos;
+        filp->f_version = inode->i_version;
+        touch_atime(filp->f_vfsmnt, filp->f_dentry);
+
+        ll_dir_chain_fini(&chain);
+
+        RETURN(rc);
+}
+
+static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
+{
+        struct inode      *inode = filp->f_dentry->d_inode;
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+        if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) {
+                return ll_readdir_20(filp, cookie, filldir);
+        } else {
+                return ll_readdir_18(filp, cookie, filldir);
+        }
+}
+
  #define QCTL_COPY(out, in)              \
  do {                                    \
          Q_COPY(out, in, qc_cmd);        \
@@ -420,7 +861,7 @@ do {                                    \
          Q_COPY(out, in, qc_dqblk);      \
  } while (0)
  
-int ll_send_mgc_param(struct obd_export *mgc, char *string)
+static int ll_send_mgc_param(struct obd_export *mgc, char *string)
  {
          struct mgs_send_param *msp;
          int rc = 0;
@@ -439,7 +880,7 @@ int ll_send_mgc_param(struct obd_export *mgc, char *string)
          return rc;
  }
  
-char *ll_get_fsname(struct inode *inode)
+static char *ll_get_fsname(struct inode *inode)
  {
          struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
          char *ptr, *fsname;
@@ -460,11 +901,12 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
                       int set_default)
  {
          struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct mdc_op_data data;
+        struct mdc_op_data data = { { 0 } };
          struct ptlrpc_request *req = NULL;
          struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
          struct obd_device *mgc = lsi->lsi_mgc;
          char *fsname = NULL, *param = NULL;
+        int lum_size;
  
          struct iattr attr = { 0 };
          int rc = 0;
@@ -474,17 +916,33 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
           * local endian.  But the MDS would like it in little
           * endian, so we swab it before we send it.
           */
-        if (lump->lmm_magic != LOV_USER_MAGIC)
+        switch (lump->lmm_magic) {
+        case LOV_USER_MAGIC_V1: {
+                if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+                        lustre_swab_lov_user_md_v1(lump);
+                lum_size = sizeof(struct lov_user_md_v1);
+                break;
+                }
+        case LOV_USER_MAGIC_V3: {
+                if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lump);
+                lum_size = sizeof(struct lov_user_md_v3);
+                break;
+                }
+        default: {
+                CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+                                " %#08x != %#08x nor %#08x\n",
+                                lump->lmm_magic, LOV_USER_MAGIC_V1,
+                                LOV_USER_MAGIC_V3);
                  RETURN(-EINVAL);
-
-        if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC))
-                lustre_swab_lov_user_md(lump);
+                }
+        }
  
          ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0, NULL);
  
          /* swabbing is done in lov_setstripe() on server side */
          rc = mdc_setattr(sbi->ll_mdc_exp, &data,
-                         &attr, lump, sizeof(*lump), NULL, 0, &req);
+                         &attr, lump, lum_size, NULL, 0, &req);
          if (rc) {
                  ptlrpc_req_finished(req);
                  if (rc != -EPERM && rc != -EACCES)
@@ -493,6 +951,9 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
          }
          ptlrpc_req_finished(req);
  
+        /* In the following we use the fact that LOV_USER_MAGIC_V1 and
+         LOV_USER_MAGIC_V3 have the same initial fields so we do not
+         need the make the distiction between the 2 versions */
          if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
                  OBD_ALLOC(param, MGS_PARAM_MAXLEN);
  
@@ -500,21 +961,21 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
                  fsname = ll_get_fsname(inode);
                  /* Set root stripesize */
                  sprintf(param, "%s-MDT0000.lov.stripesize=%u", fsname,
-                        lump->lmm_stripe_size);
+                        le32_to_cpu(lump->lmm_stripe_size));
                  rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
                  if (rc)
                          goto end;
  
                  /* Set root stripecount */
                  sprintf(param, "%s-MDT0000.lov.stripecount=%u", fsname,
-                        lump->lmm_stripe_count);
+                        le16_to_cpu(lump->lmm_stripe_count));
                  rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
                  if (rc)
                          goto end;
  
                  /* Set root stripeoffset */
                  sprintf(param, "%s-MDT0000.lov.stripeoffset=%u", fsname,
-                        lump->lmm_stripe_offset);
+                        le16_to_cpu(lump->lmm_stripe_offset));
                  rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
                  if (rc)
                          goto end;
@@ -574,9 +1035,21 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
           * little endian.  We convert it to host endian before
           * passing it to userspace.
           */
-        if ((LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) &&
-            (cpu_to_le32(LOV_MAGIC) == lmm->lmm_magic))
-                lustre_swab_lov_user_md((struct lov_user_md *)lmm);
+        /* We don't swab objects for directories */
+        switch (le32_to_cpu(lmm->lmm_magic)) {
+        case LOV_MAGIC_V1:
+                if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                break;
+        case LOV_MAGIC_V3:
+                if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                break;
+        default:
+                CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
+                rc = -EPROTO;
+        }
+
  out:
          *lmmp = lmm;
          *lmm_size = lmmsize;
@@ -648,21 +1121,34 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                  return rc;
          }
          case LL_IOC_LOV_SETSTRIPE: {
-                struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
+                struct lov_user_md_v3 lumv3;
+                struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+                struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+                struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+
                  int rc = 0;
                  int set_default = 0;
  
-                LASSERT(sizeof(lum) == sizeof(*lump));
-                LASSERT(sizeof(lum.lmm_objects[0]) ==
-                        sizeof(lump->lmm_objects[0]));
-                rc = copy_from_user(&lum, lump, sizeof(lum));
+                LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
+                LASSERT(sizeof(lumv3.lmm_objects[0]) ==
+                        sizeof(lumv3p->lmm_objects[0]));
+
+                /* first try with v1 which is smaller than v3 */
+                rc = copy_from_user(lumv1, lumv1p, sizeof(*lumv1));
                  if (rc)
                          return(-EFAULT);
  
+                if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+                        rc = copy_from_user(&lumv3, lumv3p, sizeof(lumv3));
+                        if (rc)
+                                RETURN(-EFAULT);
+                }
+
                  if (inode->i_sb->s_root == file->f_dentry)
                          set_default = 1;
  
-                rc = ll_dir_setstripe(inode, &lum, set_default);
+                /* in v1 and v3 cases lumv1 points to data */
+                rc = ll_dir_setstripe(inode, lumv1, set_default);
  
                  return rc;
          }
@@ -776,13 +1262,27 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                  if (rc)
                          GOTO(free_lmm, rc = -EFAULT);
  
-                if (lmm->lmm_magic != LOV_USER_MAGIC)
+                switch (lmm->lmm_magic) {
+                case LOV_USER_MAGIC_V1:
+                        if (LOV_USER_MAGIC == cpu_to_le32(LOV_USER_MAGIC))
+                                break;
+                        /* swab objects first so that stripes num will be sane */
+                        lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                        break;
+                case LOV_USER_MAGIC_V3:
+                        if (LOV_USER_MAGIC == cpu_to_le32(LOV_USER_MAGIC))
+                                break;
+                        /* swab objects first so that stripes num will be sane */
+                        lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                        break;
+                default:
                          GOTO(free_lmm, rc = -EINVAL);
-
-                if (LOV_USER_MAGIC != cpu_to_le32(LOV_USER_MAGIC) &&
-                    cpu_to_le32(LOV_USER_MAGIC) == cpu_to_le32(lmm->lmm_magic)) {
-                        lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
-                        lustre_swab_lov_user_md((struct lov_user_md *)lmm);
                  }
  
                  rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
@@ -862,7 +1362,7 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                  struct obd_quotactl *oqctl;
                  int rc, error = 0;
  
-                if (!capable(CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          RETURN(-EPERM);
  
                  OBD_ALLOC_PTR(oqctl);
@@ -886,7 +1386,7 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                  struct if_quotacheck *check;
                  int rc;
  
-                if (!capable(CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          RETURN(-EPERM);
  
                  OBD_ALLOC_PTR(check);
@@ -914,7 +1414,6 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                  OBD_FREE_PTR(check);
                  RETURN(rc);
          }
-#ifdef HAVE_QUOTA_SUPPORT
          case OBD_IOC_QUOTACTL: {
                  struct if_quotactl *qctl;
                  struct obd_quotactl *oqctl;
@@ -943,13 +1442,13 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                  case Q_QUOTAOFF:
                  case Q_SETQUOTA:
                  case Q_SETINFO:
-                        if (!capable(CAP_SYS_ADMIN))
+                        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                                  GOTO(out_quotactl, rc = -EPERM);
                          break;
                  case Q_GETQUOTA:
                          if (((type == USRQUOTA && current->euid != id) ||
                               (type == GRPQUOTA && !in_egroup_p(id))) &&
-                            !capable(CAP_SYS_ADMIN))
+                            !cfs_capable(CFS_CAP_SYS_ADMIN))
                                  GOTO(out_quotactl, rc = -EPERM);
  
                          /* XXX: dqb_valid is borrowed as a flag to mark that
@@ -1030,7 +1529,6 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                  OBD_FREE_PTR(oqctl);
                  RETURN(rc);
          }
-#endif /* HAVE_QUOTA_SUPPORT */
          case OBD_IOC_GETNAME_OLD:
          case OBD_IOC_GETNAME: {
                  struct obd_device *obd = class_exp2obd(sbi->ll_osc_exp);
@@ -1053,4 +1551,3 @@ struct file_operations ll_dir_operations = {
          .readdir  = ll_readdir,
          .ioctl    = ll_dir_ioctl
  };
-
diff --git a/lustre/llite/file.c b/lustre/llite/file.c

index 66e4f8a..7588544 100644 (file)
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -1,25 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/file.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -27,10 +45,8 @@
  #include <lustre_lite.h>
  #include <linux/pagemap.h>
  #include <linux/file.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/lustre_compat25.h>
-#endif
  #include "llite_internal.h"
+#include <lustre/ll_fiemap.h>
  
  /* also used by llite/special.c:ll_special_open() */
  struct ll_file_data *ll_file_data_get(void)
@@ -53,6 +69,7 @@ static int ll_close_inode_openhandle(struct inode *inode,
          struct ptlrpc_request *req = NULL;
          struct obd_device *obd;
          struct obdo *oa;
+        struct mdc_op_data data = { { 0 } };
          int rc;
          ENTRY;
  
@@ -85,8 +102,8 @@ static int ll_close_inode_openhandle(struct inode *inode,
                  oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
                  oa->o_valid |= OBD_MD_FLFLAGS;
          }
-
-        rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
+        ll_inode2fid(&data.fid1, inode);
+        rc = mdc_close(ll_i2mdcexp(inode), &data, oa, och, &req);
          if (rc == EAGAIN) {
                  /* We are the last writer, so the MDS has instructed us to get
                   * the file size and any write cookies, then close again. */
@@ -179,9 +196,10 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
                  int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
                  struct lustre_handle lockh;
                  struct inode *inode = file->f_dentry->d_inode;
-                struct ldlm_res_id file_res_id = {.name={inode->i_ino,
-                                                         inode->i_generation}};
+                struct ldlm_res_id file_res_id;
+
                  ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
+                fid_build_reg_res_name(ll_inode_lu_fid(inode), &file_res_id);
  
                  down(&lli->lli_och_sem);
                  if (fd->fd_omode & FMODE_WRITE) {
@@ -230,8 +248,8 @@ int ll_file_release(struct inode *inode, struct file *file)
          struct ll_inode_info *lli = ll_i2info(inode);
          struct lov_stripe_md *lsm = lli->lli_smd;
          int rc;
-
          ENTRY;
+
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
                 inode->i_generation, inode);
  
@@ -241,20 +259,18 @@ int ll_file_release(struct inode *inode, struct file *file)
          fd = LUSTRE_FPRIVATE(file);
          LASSERT(fd != NULL);
  
-        /*
-         * The last ref on @file, maybe not the the owner pid of statahead.
+        /* The last ref on @file, maybe not the the owner pid of statahead.
           * Different processes can open the same dir, "ll_opendir_key" means:
-         * it is me that should stop the statahead thread.
-         */
-        if (lli->lli_opendir_key == fd)
-                ll_stop_statahead(inode, fd);
+         * it is me that should stop the statahead thread. */
+        if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
+                ll_stop_statahead(inode, lli->lli_opendir_key);
  
          if (inode->i_sb->s_root == file->f_dentry) {
                  LUSTRE_FPRIVATE(file) = NULL;
                  ll_file_data_put(fd);
                  RETURN(0);
          }
-        
+
          if (lsm)
                  lov_test_and_clear_async_rc(lsm);
          lli->lli_async_rc = 0;
@@ -267,7 +283,7 @@ static int ll_intent_file_open(struct file *file, void *lmm,
                                 int lmmsize, struct lookup_intent *itp)
  {
          struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
-        struct mdc_op_data data;
+        struct mdc_op_data data = { { 0 } };
          struct dentry *parent = file->f_dentry->d_parent;
          const char *name = file->f_dentry->d_name.name;
          const int len = file->f_dentry->d_name.len;
@@ -300,11 +316,11 @@ static int ll_intent_file_open(struct file *file, void *lmm,
                  /* reason for keep own exit path - don`t flood log
                  * with messages with -ESTALE errors.
                  */
-                if (!it_disposition(itp, DISP_OPEN_OPEN) || 
+                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
                       it_open_error(DISP_OPEN_OPEN, itp))
                          GOTO(out, rc);
                  ll_release_openhandle(file->f_dentry, itp);
-                GOTO(out_stale, rc);
+                GOTO(out, rc);
          }
  
          if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
@@ -321,8 +337,6 @@ static int ll_intent_file_open(struct file *file, void *lmm,
                             req, DLM_REPLY_REC_OFF, NULL);
  out:
          ptlrpc_req_finished(itp->d.lustre.it_data);
-
-out_stale:
          it_clear_disposition(itp, DISP_ENQ_COMPLETE);
          ll_intent_drop_lock(itp);
  
@@ -411,27 +425,29 @@ int ll_file_open(struct inode *inode, struct file *file)
                  RETURN(-ENOMEM);
  
          if (S_ISDIR(inode->i_mode)) {
+again:
                  spin_lock(&lli->lli_lock);
-                /*
-                 * "lli->lli_opendir_pid != 0" means someone has set it.
-                 * "lli->lli_sai != NULL" means the previous statahead has not
-                 *                        been cleanup.
-                 */ 
-                if (lli->lli_opendir_pid == 0 && lli->lli_sai == NULL) {
-                        opendir_set = 1;
-                        lli->lli_opendir_pid = cfs_curproc_pid();
+                if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
+                        LASSERT(lli->lli_sai == NULL);
                          lli->lli_opendir_key = fd;
-                } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid())) {
+                        lli->lli_opendir_pid = cfs_curproc_pid();
+                        opendir_set = 1;
+                } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
+                                    lli->lli_opendir_key != NULL)) {
                          /* Two cases for this:
                           * (1) The same process open such directory many times.
                           * (2) The old process opened the directory, and exited
                           *     before its children processes. Then new process
                           *     with the same pid opens such directory before the
                           *     old process's children processes exit.
-                         * Change the owner to the latest one.
-                         */
-                        opendir_set = 2;
-                        lli->lli_opendir_key = fd;
+                         * reset stat ahead for such cases. */
+                        spin_unlock(&lli->lli_lock);
+                        CDEBUG(D_INFO, "Conflict statahead for %.*s %lu/%u"
+                               " reset it.\n", file->f_dentry->d_name.len,
+                               file->f_dentry->d_name.name,
+                               inode->i_ino, inode->i_generation);
+                        ll_stop_statahead(inode, lli->lli_opendir_key);
+                        goto again;
                  }
                  spin_unlock(&lli->lli_lock);
          }
@@ -489,9 +505,9 @@ restart:
                                  up(&lli->lli_och_sem);
                                  ll_file_data_put(fd);
                                  GOTO(out_openerr, rc);
-                        }       
+                        }
                          ll_release_openhandle(file->f_dentry, it);
-                        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, 
+                        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
                                               LPROC_LL_OPEN);
                  }
                  (*och_usecount)++;
@@ -508,7 +524,9 @@ restart:
                             would attempt to grab och_sem as well, that would
                             result in a deadlock */
                          up(&lli->lli_och_sem);
+                        it->it_flags |= O_CHECK_STALE;
                          rc = ll_intent_file_open(file, NULL, 0, it);
+                        it->it_flags &= ~O_CHECK_STALE;
                          if (rc) {
                                  ll_file_data_put(fd);
                                  GOTO(out_openerr, rc);
@@ -518,7 +536,7 @@ restart:
                                            file->f_dentry->d_inode);
                          goto restart;
                  }
- 
+
                  OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
                  if (!*och_p) {
                          ll_file_data_put(fd);
@@ -575,13 +593,10 @@ out_och_free:
                  }
                  up(&lli->lli_och_sem);
  out_openerr:
-                if (opendir_set) {
-                        lli->lli_opendir_key = NULL;
-                        lli->lli_opendir_pid = 0;
-                } else if (unlikely(opendir_set == 2)) {
-                        ll_stop_statahead(inode, fd);
-                }
+                if (opendir_set != 0)
+                        ll_stop_statahead(inode, lli->lli_opendir_key);
          }
+
          return rc;
  }
  
@@ -600,10 +615,11 @@ int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
          oinfo.oi_md = lsm;
          oinfo.oi_oa = oa;
          oa->o_id = lsm->lsm_object_id;
+        oa->o_gr = lsm->lsm_object_gr;
          oa->o_mode = S_IFREG;
          oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
                  OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
-                OBD_MD_FLCTIME;
+                OBD_MD_FLCTIME | OBD_MD_FLGROUP;
  
          set = ptlrpc_prep_set();
          if (set == NULL) {
@@ -622,21 +638,6 @@ int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
          RETURN(0);
  }
  
-static inline void ll_remove_suid(struct inode *inode)
-{
-        unsigned int mode;
-
-        /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
-        mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
-
-        /* was any of the uid bits set? */
-        mode &= inode->i_mode;
-        if (mode && !capable(CAP_FSETID)) {
-                inode->i_mode &= ~mode;
-                // XXX careful here - we cannot change the size
-        }
-}
-
  static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
  {
          struct ll_inode_info *lli = ll_i2info(inode);
@@ -645,9 +646,9 @@ static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
          struct {
                  char name[16];
                  struct ldlm_lock *lock;
-                struct lov_stripe_md *lsm;
-        } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock, .lsm = lsm };
+        } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
          __u32 stripe, vallen = sizeof(stripe);
+        struct lov_oinfo *loinfo;
          int rc;
          ENTRY;
  
@@ -655,7 +656,7 @@ static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
                  GOTO(check, stripe = 0);
  
          /* get our offset in the lov */
-        rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
+        rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
          if (rc != 0) {
                  CERROR("obd_get_info: rc = %d\n", rc);
                  RETURN(rc);
@@ -663,11 +664,11 @@ static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
          LASSERT(stripe < lsm->lsm_stripe_count);
  
  check:
-        if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
-            lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[1]){
+        loinfo = lsm->lsm_oinfo[stripe];
+        if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
+                            &lock->l_resource->lr_name)) {
                  LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
-                           lsm->lsm_oinfo[stripe]->loi_id,
-                           lsm->lsm_oinfo[stripe]->loi_gr);
+                           loinfo->loi_id, loinfo->loi_gr);
                  RETURN(-ELDLM_NO_LOCK_DATA);
          }
  
@@ -678,7 +679,7 @@ check:
  void ll_pin_extent_cb(void *data)
  {
          struct page *page = data;
-        
+
          page_cache_get(page);
  
          return;
@@ -728,14 +729,10 @@ int ll_page_removal_cb(void *data, int discard)
                          CERROR("writepage inode %lu(%p) of page %p "
                                 "failed: %d\n", mapping->host->i_ino,
                                 mapping->host, page, rc);
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
                          if (rc == -ENOSPC)
                                  set_bit(AS_ENOSPC, &mapping->flags);
                          else
                                  set_bit(AS_EIO, &mapping->flags);
-#else
-                        mapping->gfp_mask |= AS_EIO_MASK;
-#endif
                  }
          }
          if (page->mapping != NULL) {
@@ -896,8 +893,8 @@ static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
  
          LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
                     " atime "LPU64", mtime "LPU64", ctime "LPU64,
-                   i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
-                   lvb->lvb_atime, lvb->lvb_ctime);
+                   i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_atime,
+                   lvb->lvb_mtime, lvb->lvb_ctime);
   iput:
          iput(inode);
  
@@ -919,9 +916,9 @@ int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
          struct obd_info oinfo = { { { 0 } } };
          struct ost_lvb lvb;
          int rc;
-        
+
          ENTRY;
-        
+
          einfo.ei_type = LDLM_EXTENT;
          einfo.ei_mode = LCK_PR;
          einfo.ei_cb_bl = osc_extent_blocking_cb;
@@ -942,7 +939,7 @@ int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
                         "returning -EIO\n", rc);
                  RETURN(rc > 0 ? -EIO : rc);
          }
-        
+
          lov_stripe_lock(lsm);
          memset(&lvb, 0, sizeof(lvb));
          obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
@@ -952,7 +949,7 @@ int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
          st->st_atime = lvb.lvb_atime;
          st->st_ctime = lvb.lvb_ctime;
          lov_stripe_unlock(lsm);
-        
+
          RETURN(rc);
  }
  
@@ -1005,6 +1002,11 @@ int ll_glimpse_size(struct inode *inode, int ast_flags)
  
          ll_inode_size_lock(inode, 1);
          inode_init_lvb(inode, &lvb);
+        /* merge timestamps the most resently obtained from mds with
+           timestamps obtained from osts */
+        lvb.lvb_atime = lli->lli_lvb.lvb_atime;
+        lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
+        lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
          rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
          i_size_write(inode, lvb.lvb_size);
          inode->i_blocks = lvb.lvb_blocks;
@@ -1162,12 +1164,13 @@ static int ll_is_file_contended(struct file *file)
  static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree,
                                       struct file *file, const struct iovec *iov,
                                       unsigned long nr_segs,
-                                     loff_t start, loff_t end, int rw)
+                                     obd_off start, obd_off end, int rw)
  {
          int append;
          int tree_locked = 0;
          int rc;
          struct inode * inode = file->f_dentry->d_inode;
+        ENTRY;
  
          append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
  
@@ -1200,7 +1203,7 @@ out:
  
  /* XXX: exact copy from kernel code (__generic_file_aio_write_nolock from rhel4)
   */
-static size_t ll_file_get_iov_count(const struct iovec *iov, 
+static size_t ll_file_get_iov_count(const struct iovec *iov,
                                       unsigned long *nr_segs)
  {
          size_t count = 0;
@@ -1467,8 +1470,7 @@ repeat:
          if (sbi->ll_max_rw_chunk != 0) {
                  /* first, let's know the end of the current stripe */
                  end = *ppos;
-                obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
-                                (obd_off *)&end);
+                obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,&end);
  
                  /* correct, the end is beyond the request */
                  if (end > *ppos + count - 1)
@@ -1490,7 +1492,7 @@ repeat:
                                  nrsegs_orig = nr_segs;
                                  OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
                                  if (!iov_copy)
-                                        GOTO(out, retval = -ENOMEM); 
+                                        GOTO(out, retval = -ENOMEM);
                          }
  
                          iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
@@ -1560,11 +1562,23 @@ repeat:
  
          /* turn off the kernel's read-ahead */
          if (lock_style != LL_LOCK_STYLE_NOLOCK) {
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                file->f_ramax = 0;
-#else
+                /* read under locks
+                 *
+                 * 1. update inode's atime as long as concurrent stat
+                 * (via ll_glimpse_size) might bring out-of-date ones
+                 *
+                 * 2. update lsm so that next stat (via
+                 * ll_glimpse_size) could get correct values in lsm */
+                struct ost_lvb xtimes;
+
+                lov_stripe_lock(lsm);
+                LTIME_S(inode->i_atime) = LTIME_S(CURRENT_TIME);
+                xtimes.lvb_atime = LTIME_S(inode->i_atime);
+                obd_update_lvb(sbi->ll_osc_exp, lsm, &xtimes,
+                               OBD_MD_FLATIME);
+                lov_stripe_unlock(lsm);
+
                  file->f_ra.ra_pages = 0;
-#endif
                  /* initialize read-ahead window once per syscall */
                  if (ra == 0) {
                          ra = 1;
@@ -1584,6 +1598,11 @@ repeat:
                  ll_file_put_lock(inode, end, lock_style, cookie,
                                   &tree, OBD_BRW_READ);
          } else {
+                /* lockless read
+                 *
+                 * current time will get into request as atime
+                 * (lustre/osc/osc_request.c:osc_build_request())
+                 */
                  retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy, ppos,
                                               READ, chunk);
          }
@@ -1658,7 +1677,7 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
  
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
                 inode->i_ino, inode->i_generation, inode, count, *ppos);
-        
+
          SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
  
          /* POSIX, but surprised the VFS doesn't check this already */
@@ -1687,7 +1706,7 @@ repeat:
          } else if (sbi->ll_max_rw_chunk != 0) {
                  /* first, let's know the end of the current stripe */
                  end = *ppos;
-                obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END, 
+                obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
                                  (obd_off *)&end);
  
                  /* correct, the end is beyond the request */
@@ -1711,7 +1730,7 @@ repeat:
                                  nrsegs_orig = nr_segs;
                                  OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
                                  if (!iov_copy)
-                                        GOTO(out, retval = -ENOMEM); 
+                                        GOTO(out, retval = -ENOMEM);
                          }
                          iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
                                          &iov_offset, chunk);
@@ -1751,16 +1770,41 @@ repeat:
          chunk = end - *ppos + 1;
          CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
                 inode->i_ino, chunk, *ppos);
-        if (tree_locked)
+        if (tree_locked) {
+                /* write under locks
+                 *
+                 * 1. update inode's mtime and ctime as long as
+                 * concurrent stat (via ll_glimpse_size) might bring
+                 * out-of-date ones
+                 *
+                 * 2. update lsm so that next stat (via
+                 * ll_glimpse_size) could get correct values in lsm */
+                struct ost_lvb xtimes;
+
+                lov_stripe_lock(lsm);
+                LTIME_S(inode->i_mtime) = LTIME_S(CURRENT_TIME);
+                LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
+                xtimes.lvb_mtime = LTIME_S(inode->i_mtime);
+                xtimes.lvb_ctime = LTIME_S(inode->i_ctime);
+                obd_update_lvb(sbi->ll_osc_exp, lsm, &xtimes,
+                               OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+                lov_stripe_unlock(lsm);
+
  #ifdef HAVE_FILE_WRITEV
                  retval = generic_file_writev(file, iov_copy, nrsegs_copy, ppos);
  #else
                  retval = generic_file_aio_write(iocb, iov_copy, nrsegs_copy,
                                                  *ppos);
  #endif
-        else
+        } else {
+                /* lockless write
+                 *
+                 * current time will get into request as mtime and
+                 * ctime (lustre/osc/osc_request.c:osc_build_request())
+                 */
                  retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy,
                                               ppos, WRITE, chunk);
+        }
          ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
  
  out_unlock:
@@ -1812,7 +1856,6 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
  /*
   * Send file content (through pagecache) somewhere with helper
   */
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
                                  read_actor_t actor, void *target)
  {
@@ -1905,7 +1948,6 @@ static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
          ll_tree_unlock(&tree);
          RETURN(retval);
  }
-#endif
  
  static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
                                 unsigned long arg)
@@ -1920,7 +1962,7 @@ static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
          struct lov_stripe_md *lsm, *lsm2;
          ENTRY;
  
-        if (!capable (CAP_SYS_ADMIN))
+        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                  RETURN(-EPERM);
  
          rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
@@ -2000,8 +2042,8 @@ out_req_free:
          goto out;
  }
  
-int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, 
-                             struct lov_mds_md **lmmp, int *lmm_size, 
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+                             struct lov_mds_md **lmmp, int *lmm_size,
                               struct ptlrpc_request **request)
  {
          struct ll_sb_info *sbi = ll_i2sbi(inode);
@@ -2045,8 +2087,9 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
          LASSERT(lmm != NULL);
          LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
  
-        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC)) &&
-             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
+        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
+            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
+            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
                  GOTO(out, rc = -EPROTO);
          }
          /*
@@ -2055,9 +2098,20 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
           * passing it to userspace.
           */
          if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
-                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC)) {
-                        lustre_swab_lov_user_md((struct lov_user_md *)lmm);
-                        lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
+                /* if function called for directory - we should
+                 * avoid swab not existent lsm objects */
+                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
+                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                        if (S_ISREG(body->mode))
+                                lustre_swab_lov_user_md_objects(
+                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                                 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                        if (S_ISREG(body->mode))
+                                lustre_swab_lov_user_md_objects(
+                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                                 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
                  } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
                          lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
                  }
@@ -2130,7 +2184,7 @@ static int ll_lov_setea(struct inode *inode, struct file *file,
          int rc;
          ENTRY;
  
-        if (!capable (CAP_SYS_ADMIN))
+        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                  RETURN(-EPERM);
  
          OBD_ALLOC(lump, lum_size);
@@ -2152,23 +2206,34 @@ static int ll_lov_setea(struct inode *inode, struct file *file,
  static int ll_lov_setstripe(struct inode *inode, struct file *file,
                              unsigned long arg)
  {
-        struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
+        struct lov_user_md_v3 lumv3;
+        struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+        struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+        struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+        int lum_size;
          int rc;
          int flags = FMODE_WRITE;
          ENTRY;
  
-        /* Bug 1152: copy properly when this is no longer true */
-        LASSERT(sizeof(lum) == sizeof(*lump));
-        LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
-        rc = copy_from_user(&lum, lump, sizeof(lum));
+        /* first try with v1 which is smaller than v3 */
+        lum_size = sizeof(struct lov_user_md_v1);
+        rc = copy_from_user(lumv1, lumv1p, lum_size);
          if (rc)
                  RETURN(-EFAULT);
  
-        rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
+        if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+                lum_size = sizeof(struct lov_user_md_v3);
+                rc = copy_from_user(&lumv3, lumv3p, lum_size);
+                if (rc)
+                        RETURN(-EFAULT);
+        }
+
+        rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
          if (rc == 0) {
-                 put_user(0, &lump->lmm_stripe_count);
+                 put_user(0, &lumv1p->lmm_stripe_count);
                   rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
-                                    0, ll_i2info(inode)->lli_smd, lump);
+                                    0, ll_i2info(inode)->lli_smd,
+                                    (void *)arg);
          }
          RETURN(rc);
  }
@@ -2244,6 +2309,7 @@ static int ll_put_grouplock(struct inode *inode, struct file *file,
          RETURN(0);
  }
  
+#if LUSTRE_FIX >= 50
  static int join_sanity_check(struct inode *head, struct inode *tail)
  {
          ENTRY;
@@ -2314,6 +2380,8 @@ static int join_file(struct inode *head_inode, struct file *head_filp,
                  ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
                  oit.d.lustre.it_lock_mode = 0;
          }
+        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
+        it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
          ll_release_openhandle(head_filp->f_dentry, &oit);
  out:
          if (op_data)
@@ -2414,7 +2482,17 @@ cleanup:
          }
          RETURN(rc);
  }
+#endif  /* LUSTRE_FIX >= 50 */
  
+/**
+ * Close inode open handle
+ *
+ * \param dentry [in]     dentry which contains the inode
+ * \param it     [in,out] intent which contains open info and result
+ *
+ * \retval 0     success
+ * \retval <0    failure
+ */
  int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
  {
          struct inode *inode = dentry->d_inode;
@@ -2445,11 +2523,49 @@ int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
          OBD_FREE(och, sizeof(*och));
   out:
          /* this one is in place of ll_file_open */
-        ptlrpc_req_finished(it->d.lustre.it_data);
+        if (it_disposition(it, DISP_ENQ_OPEN_REF))
+                ptlrpc_req_finished(it->d.lustre.it_data);
          it_clear_disposition(it, DISP_ENQ_OPEN_REF);
          RETURN(rc);
  }
  
+int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+              int num_bytes)
+{
+        struct obd_export *exp = ll_i2obdexp(inode);
+        struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
+        int vallen = num_bytes;
+        int rc;
+        ENTRY;
+
+        /* If the stripe_count > 1 and the application does not understand
+         * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
+         */
+        if (lsm->lsm_stripe_count > 1 &&
+            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
+                return -EOPNOTSUPP;
+
+        fm_key.oa.o_id = lsm->lsm_object_id;
+        fm_key.oa.o_valid = OBD_MD_FLID;
+
+        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLSIZE);
+
+        /* If filesize is 0, then there would be no objects for mapping */
+        if (fm_key.oa.o_size == 0) {
+                fiemap->fm_mapped_extents = 0;
+                RETURN(0);
+        }
+
+        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
+
+        rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
+        if (rc)
+                CERROR("obd_get_info failed: rc = %d\n", rc);
+
+        RETURN(rc);
+}
+
  int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
                    unsigned long arg)
  {
@@ -2499,6 +2615,72 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
                  RETURN(ll_lov_getstripe(inode, arg));
          case LL_IOC_RECREATE_OBJ:
                  RETURN(ll_lov_recreate_obj(inode, file, arg));
+        case EXT3_IOC_FIEMAP: {
+                struct ll_user_fiemap *fiemap_s;
+                size_t num_bytes, ret_bytes;
+                unsigned int extent_count;
+                int rc = 0;
+
+                /* Get the extent count so we can calculate the size of
+                 * required fiemap buffer */
+                if (get_user(extent_count,
+                    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
+                        RETURN(-EFAULT);
+                num_bytes = sizeof(*fiemap_s) + (extent_count *
+                                                 sizeof(struct ll_fiemap_extent));
+                OBD_VMALLOC(fiemap_s, num_bytes);
+                if (fiemap_s == NULL)
+                        RETURN(-ENOMEM);
+
+                if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
+                                   sizeof(*fiemap_s)))
+                        GOTO(error, rc = -EFAULT);
+
+                if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+                        fiemap_s->fm_flags = fiemap_s->fm_flags &
+                                                    ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+                        if (copy_to_user((char *)arg, fiemap_s,
+                                         sizeof(*fiemap_s)))
+                                GOTO(error, rc = -EFAULT);
+
+                        GOTO(error, rc = -EBADR);
+                }
+
+                /* If fm_extent_count is non-zero, read the first extent since
+                 * it is used to calculate end_offset and device from previous
+                 * fiemap call. */
+                if (extent_count) {
+                        if (copy_from_user(&fiemap_s->fm_extents[0],
+                            (char __user *)arg + sizeof(*fiemap_s),
+                            sizeof(struct ll_fiemap_extent)))
+                                GOTO(error, rc = -EFAULT);
+                }
+
+                if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
+                        int rc;
+
+                        rc = filemap_fdatawrite(inode->i_mapping);
+                        if (rc)
+                                GOTO(error, rc);
+                }
+
+                rc = ll_fiemap(inode, fiemap_s, num_bytes);
+                if (rc)
+                        GOTO(error, rc);
+
+                ret_bytes = sizeof(struct ll_user_fiemap);
+
+                if (extent_count != 0)
+                        ret_bytes += (fiemap_s->fm_mapped_extents *
+                                         sizeof(struct ll_fiemap_extent));
+
+                if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
+                        rc = -EFAULT;
+
+error:
+                OBD_VFREE(fiemap_s, num_bytes);
+                RETURN(rc);
+        }
          case EXT3_IOC_GETFLAGS:
          case EXT3_IOC_SETFLAGS:
                  RETURN(ll_iocontrol(inode, file, cmd, arg));
@@ -2506,6 +2688,8 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
          case EXT3_IOC_GETVERSION:
                  RETURN(put_user(inode->i_generation, (int *)arg));
          case LL_IOC_JOIN: {
+#if LUSTRE_FIX >= 50
+                /* Allow file join in beta builds to allow debuggging */
                  char *ftail;
                  int rc;
  
@@ -2515,6 +2699,10 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
                  rc = ll_file_join(inode, file, ftail);
                  putname(ftail);
                  RETURN(rc);
+#else
+                CWARN("file join is not supported in this version of Lustre\n");
+                RETURN(-ENOTTY);
+#endif
          }
          case LL_IOC_GROUP_LOCK:
                  RETURN(ll_get_grouplock(inode, file, arg));
@@ -2543,7 +2731,7 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
          default: {
                  int err;
  
-                if (LLIOC_STOP == 
+                if (LLIOC_STOP ==
                      ll_iocontrol_call(inode, file, cmd, arg, &err))
                          RETURN(err);
  
@@ -2590,12 +2778,7 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
          if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
                  if (offset != file->f_pos) {
                          file->f_pos = offset;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                        file->f_reada = 0;
-                        file->f_version = ++event;
-#else
                          file->f_version = 0;
-#endif
                  }
                  retval = offset;
          }
@@ -2640,22 +2823,29 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data)
                  ptlrpc_req_finished(req);
  
          if (data && lsm) {
-                struct obdo *oa;
+                struct obd_info *oinfo;
  
-                OBDO_ALLOC(oa);
-                if (!oa)
+                OBD_ALLOC_PTR(oinfo);
+                if (!oinfo)
                          RETURN(rc ? rc : -ENOMEM);
-
-                oa->o_id = lsm->lsm_object_id;
-                oa->o_valid = OBD_MD_FLID;
-                obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
-                                           OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-
-                err = obd_sync(ll_i2sbi(inode)->ll_osc_exp, oa, lsm,
-                               0, OBD_OBJECT_EOF);
+                OBDO_ALLOC(oinfo->oi_oa);
+                if (!oinfo->oi_oa) {
+                        OBD_FREE_PTR(oinfo);
+                        RETURN(rc ? rc : -ENOMEM);
+                }
+                oinfo->oi_oa->o_id = lsm->lsm_object_id;
+                oinfo->oi_oa->o_gr = lsm->lsm_object_gr;
+                oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+                obdo_from_inode(oinfo->oi_oa, inode,
+                                OBD_MD_FLTYPE | OBD_MD_FLATIME |
+                                OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+                oinfo->oi_md = lsm;
+                err = obd_sync_rqset(ll_i2sbi(inode)->ll_osc_exp, oinfo,
+                                     0, OBD_OBJECT_EOF);
                  if (!rc)
                          rc = err;
-                OBDO_FREE(oa);
+                OBDO_FREE(oinfo->oi_oa);
+                OBD_FREE_PTR(oinfo);
          }
  
          RETURN(rc);
@@ -2665,8 +2855,12 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
  {
          struct inode *inode = file->f_dentry->d_inode;
          struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct lu_fid *fid = ll_inode_lu_fid(inode);
          struct ldlm_res_id res_id =
-                    { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} };
+                    { .name = { fid_seq(fid),
+                                fid_oid(fid),
+                                fid_ver(fid),
+                                LDLM_FLOCK} };
          struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
                  ldlm_flock_completion_ast, NULL, file_lock };
          struct lustre_handle lockh = {0};
@@ -2679,6 +2873,15 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
                 inode->i_ino, file_lock);
          ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
  
+        if (fid_is_igif(fid)) {
+                /* If this is an IGIF inode, we need to keep the 1.6-style
+                 * flock mapping for compatibility.  If it is a proper FID
+                 * then we know any other client accessing it must also be
+                 * accessing it as a FID and can use the CMD-style flock. */
+                res_id.name[2] = LDLM_FLOCK;
+                res_id.name[3] = 0;
+        }
+
          if (file_lock->fl_flags & FL_FLOCK) {
                  LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
                  /* set missing params for flock() calls */
@@ -2709,7 +2912,7 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
                  break;
          default:
                  CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
-                LBUG();
+                RETURN (-EINVAL);
          }
  
          switch (cmd) {
@@ -2736,7 +2939,7 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
                  break;
          default:
                  CERROR("unknown fcntl lock command: %d\n", cmd);
-                LBUG();
+                RETURN (-EINVAL);
          }
  
          CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
@@ -2745,10 +2948,12 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
  
          rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
                                &flock, &flags, NULL, 0, NULL, &lockh, 0);
-        if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
+        if ((file_lock->fl_flags & FL_FLOCK) &&
+            (rc == 0 || file_lock->fl_type == F_UNLCK))
                  ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
  #ifdef HAVE_F_OP_FLOCK
-        if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
+        if ((file_lock->fl_flags & FL_POSIX) &&
+            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
              !(flags & LDLM_FL_TEST_LOCK))
                  posix_lock_file_wait(file, file_lock);
  #endif
@@ -2766,7 +2971,7 @@ int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
  int ll_have_md_lock(struct inode *inode, __u64 bits)
  {
          struct lustre_handle lockh;
-        struct ldlm_res_id res_id = { .name = {0} };
+        struct ldlm_res_id res_id;
          struct obd_device *obddev;
          ldlm_policy_data_t policy = { .l_inodebits = {bits}};
          int flags;
@@ -2776,10 +2981,12 @@ int ll_have_md_lock(struct inode *inode, __u64 bits)
                 RETURN(0);
  
          obddev = ll_i2mdcexp(inode)->exp_obd;
-        res_id.name[0] = inode->i_ino;
-        res_id.name[1] = inode->i_generation;
+        fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id);
  
-        CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]);
+        CDEBUG(D_INFO, "trying to match res "LPU64":"LPU64":"LPU64"\n",
+                res_id.name[0],
+                res_id.name[1],
+                res_id.name[2]);
  
          flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
          if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
@@ -2825,29 +3032,28 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
          }
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
                 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
-#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
-        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
-#endif
  
          exp = ll_i2mdcexp(inode);
  
          if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
                  struct lookup_intent oit = { .it_op = IT_GETATTR };
-                struct mdc_op_data op_data;
+                struct mdc_op_data op_data = { { 0 } };
  
                  /* Call getattr by fid, so do not provide name at all. */
                  ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
                                         dentry->d_inode, NULL, 0, 0, NULL);
+                oit.it_flags |= O_CHECK_STALE;
                  rc = mdc_intent_lock(exp, &op_data, NULL, 0,
                                       /* we are not interested in name
                                          based lookup */
                                       &oit, 0, &req,
                                       ll_mdc_blocking_ast, 0);
+                oit.it_flags &= ~O_CHECK_STALE;
                  if (rc < 0) {
                          rc = ll_inode_revalidate_fini(inode, rc);
                          GOTO (out, rc);
                  }
-                
+
                  rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
                  if (rc != 0) {
                          ll_intent_release(&oit);
@@ -2859,9 +3065,11 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
                     here to preserve get_cwd functionality on 2.6.
                     Bug 10503 */
                  if (!dentry->d_inode->i_nlink) {
+                        spin_lock(&ll_lookup_lock);
                          spin_lock(&dcache_lock);
                          ll_drop_dentry(dentry);
                          spin_unlock(&dcache_lock);
+                        spin_unlock(&ll_lookup_lock);
                  }
  
                  ll_lookup_finish_locks(&oit, dentry);
@@ -2874,8 +3082,8 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
  
                  if (S_ISREG(inode->i_mode)) {
                          rc = ll_get_max_mdsize(sbi, &ealen);
-                        if (rc) 
-                                RETURN(rc); 
+                        if (rc)
+                                RETURN(rc);
                          valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
                  }
                  ll_inode2fid(&fid, inode);
@@ -2892,8 +3100,12 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
          }
  
          /* if object not yet allocated, don't validate size */
-        if (ll_i2info(inode)->lli_smd == NULL) 
+        if (ll_i2info(inode)->lli_smd == NULL) {
+                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
+                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
+                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
                  GOTO(out, rc = 0);
+        }
  
          /* ll_glimpse_size will prefer locally cached writes if they extend
           * the file */
@@ -2904,7 +3116,6 @@ out:
          RETURN(rc);
  }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                    struct lookup_intent *it, struct kstat *stat)
  {
@@ -2946,7 +3157,6 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
  
          return ll_getattr_it(mnt, de, &it, stat);
  }
-#endif
  
  static
  int lustre_check_acl(struct inode *inode, int mask)
@@ -3023,10 +3233,10 @@ check_groups:
  check_capabilities:
          if (!(mask & MAY_EXEC) ||
              (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
-                if (capable(CAP_DAC_OVERRIDE))
+                if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
                          return 0;
  
-        if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
+        if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
              (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
                  return 0;
  
@@ -3053,9 +3263,7 @@ struct file_operations ll_file_operations = {
          .release        = ll_file_release,
          .mmap           = ll_file_mmap,
          .llseek         = ll_file_seek,
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
          .sendfile       = ll_file_sendfile,
-#endif
          .fsync          = ll_fsync,
  };
  
@@ -3069,7 +3277,7 @@ struct file_operations ll_file_operations_flock = {
          .write          = ll_file_write,
  #ifdef HAVE_FILE_WRITEV
          .writev         = ll_file_writev,
-#else   
+#else
          .aio_write      = ll_file_aio_write,
  #endif
          .ioctl          = ll_file_ioctl,
@@ -3077,9 +3285,7 @@ struct file_operations ll_file_operations_flock = {
          .release        = ll_file_release,
          .mmap           = ll_file_mmap,
          .llseek         = ll_file_seek,
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
          .sendfile       = ll_file_sendfile,
-#endif
          .fsync          = ll_fsync,
  #ifdef HAVE_F_OP_FLOCK
          .flock          = ll_file_flock,
@@ -3098,7 +3304,7 @@ struct file_operations ll_file_operations_noflock = {
          .write          = ll_file_write,
  #ifdef HAVE_FILE_WRITEV
          .writev         = ll_file_writev,
-#else   
+#else
          .aio_write      = ll_file_aio_write,
  #endif
          .ioctl          = ll_file_ioctl,
@@ -3106,9 +3312,7 @@ struct file_operations ll_file_operations_noflock = {
          .release        = ll_file_release,
          .mmap           = ll_file_mmap,
          .llseek         = ll_file_seek,
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
          .sendfile       = ll_file_sendfile,
-#endif
          .fsync          = ll_fsync,
  #ifdef HAVE_F_OP_FLOCK
          .flock          = ll_file_noflock,
@@ -3122,11 +3326,7 @@ struct inode_operations ll_file_inode_operations = {
  #endif
          .setattr        = ll_setattr,
          .truncate       = ll_truncate,
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
          .getattr        = ll_getattr,
-#else
-        .revalidate_it  = ll_inode_revalidate_it,
-#endif
          .permission     = ll_inode_permission,
          .setxattr       = ll_setxattr,
          .getxattr       = ll_getxattr,
@@ -3138,8 +3338,8 @@ struct inode_operations ll_file_inode_operations = {
  static struct llioc_ctl_data {
          struct rw_semaphore ioc_sem;
          struct list_head    ioc_head;
-} llioc = { 
-        __RWSEM_INITIALIZER(llioc.ioc_sem), 
+} llioc = {
+        __RWSEM_INITIALIZER(llioc.ioc_sem),
          CFS_LIST_HEAD_INIT(llioc.ioc_head)
  };
  
@@ -3207,7 +3407,7 @@ void ll_iocontrol_unregister(void *magic)
  EXPORT_SYMBOL(ll_iocontrol_register);
  EXPORT_SYMBOL(ll_iocontrol_unregister);
  
-enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, 
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
                          unsigned int cmd, unsigned long arg, int *rcp)
  {
          enum llioc_iter ret = LLIOC_CONT;
@@ -3217,7 +3417,7 @@ enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
          down_read(&llioc.ioc_sem);
          list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
                  for (i = 0; i < data->iocd_count; i++) {
-                        if (cmd != data->iocd_cmd[i]) 
+                        if (cmd != data->iocd_cmd[i])
                                  continue;
  
                          ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c

index d36f2d6..ff3bd0e 100644 (file)
--- a/lustre/llite/llite_close.c
+++ b/lustre/llite/llite_close.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Lite routines to issue a secondary close after writeback
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_close.c
+ *
+ * Lustre Lite routines to issue a secondary close after writeback
   */
  
  #include <linux/module.h>
@@ -122,6 +139,7 @@ static void ll_close_done_writing(struct inode *inode)
          ldlm_policy_data_t policy = { .l_extent = {0, OBD_OBJECT_EOF } };
          struct lustre_handle lockh = { 0 };
          struct obdo obdo;
+        struct mdc_op_data data = { { 0 } };
          obd_flag valid;
          int rc, ast_flags = 0;
          ENTRY;
@@ -166,7 +184,8 @@ static void ll_close_done_writing(struct inode *inode)
          obdo.o_blocks = inode->i_blocks;
          obdo.o_valid = OBD_MD_FLID | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
  
-        rc = mdc_done_writing(ll_i2sbi(inode)->ll_mdc_exp, &obdo);
+        ll_inode2fid(&data.fid1, inode);
+        rc = mdc_done_writing(ll_i2sbi(inode)->ll_mdc_exp, &data, &obdo);
   out:
  }
  
@@ -265,5 +284,3 @@ void ll_close_thread_shutdown(struct ll_close_queue *lcq)
          wait_for_completion(&lcq->lcq_comp);
          OBD_FREE(lcq, sizeof(*lcq));
  }
-
-
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h

index 5605f8c..522f9ce 100644 (file)
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef LLITE_INTERNAL_H
@@ -19,6 +51,11 @@
  #include <lustre_ver.h>
  #include <linux/lustre_version.h>
  #include <lustre_disk.h>  /* for s2sbi */
+
+#ifndef HAVE_LE_TYPES
+typedef __u16 __le16;
+typedef __u32 __le32;
+#endif
   
  /*
  struct lustre_intent_data {
@@ -39,11 +76,7 @@ struct lustre_intent_data {
  #ifdef HAVE_VFS_INTENT_PATCHES
  static inline struct lookup_intent *ll_nd2it(struct nameidata *nd)
  {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
          return &nd->intent;
-#else
-        return nd->intent;
-#endif
  }
  #endif
  
@@ -76,7 +109,6 @@ struct ll_dentry_data {
          struct lookup_intent    *lld_it;
  #endif
          unsigned int             lld_sa_generation;
-        cfs_waitq_t              lld_waitq;
  };
  
  #define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
@@ -129,9 +161,11 @@ struct ll_inode_info {
          struct obd_client_handle *lli_mds_exec_och;
          __u64                   lli_open_fd_exec_count;
  
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-        struct inode            lli_vfs_inode;
-#endif
+        /** fid of this object. */
+        union {
+                struct lu_fid f20;
+                struct ll_fid f16;
+        } lli_fid;
  
          /* metadata stat-ahead */
          /*
@@ -145,6 +179,10 @@ struct ll_inode_info {
           * before child -- it is me should cleanup the dir readahead. */
          void                   *lli_opendir_key;
          struct ll_statahead_info *lli_sai;
+        /* the most recent attributes from mds, it is used for timestampts
+         * only so far */
+        struct ost_lvb         lli_lvb;
+        struct inode            lli_vfs_inode;
  };
  
  /*
@@ -162,12 +200,7 @@ void ll_inode_size_unlock(struct inode *inode, int unlock_lsm);
  // static inline struct ll_inode_info *LL_I(struct inode *inode)
  static inline struct ll_inode_info *ll_i2info(struct inode *inode)
  {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
          return container_of(inode, struct ll_inode_info, lli_vfs_inode);
-#else
-        CLASSERT(sizeof(inode->u) >= sizeof(struct ll_inode_info));
-        return (struct ll_inode_info *)&(inode->u.generic_ip);
-#endif
  }
  
  /* default to about 40meg of readahead on a given system.  That much tied
@@ -193,11 +226,26 @@ enum ra_stat {
          _NR_RA_STAT,
  };
  
+#define LL_RA_STAT      _NR_RA_STAT
+#define LL_RA_STAT_STRINGS           {                                  \
+        [RA_STAT_HIT]               = "hits",                           \
+        [RA_STAT_MISS]              = "misses",                         \
+        [RA_STAT_DISTANT_READPAGE]  = "readpage not consecutive",       \
+        [RA_STAT_MISS_IN_WINDOW]    = "miss inside window",             \
+        [RA_STAT_FAILED_GRAB_PAGE]  = "failed grab_cache_page",         \
+        [RA_STAT_FAILED_MATCH]      = "failed lock match",              \
+        [RA_STAT_DISCARDED]         = "read but discarded",             \
+        [RA_STAT_ZERO_LEN]          = "zero length file",               \
+        [RA_STAT_ZERO_WINDOW]       = "zero size window",               \
+        [RA_STAT_EOF]               = "read-ahead to EOF",              \
+        [RA_STAT_MAX_IN_FLIGHT]     = "hit max r-a issue",              \
+        [RA_STAT_WRONG_GRAB_PAGE]   = "wrong page from grab_cache_page",\
+} 
+
  struct ll_ra_info {
-        unsigned long             ra_cur_pages;
+        atomic_t                  ra_cur_pages;
          unsigned long             ra_max_pages;
          unsigned long             ra_max_read_ahead_whole_pages;
-        unsigned long             ra_stats[_NR_RA_STAT];
  };
  
  /* LL_HIST_MAX=32 causes an overflow */
@@ -253,10 +301,30 @@ enum stats_track_type {
  /* default value for lockless_truncate_enable */
  #define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
  
+/* percpu data structure for lustre lru page list */
+struct ll_pglist_data {
+        spinlock_t                llpd_lock; /* lock to protect llpg_list */
+        struct list_head          llpd_list; /* all pages (llap_pglist_item) */
+        unsigned long             llpd_gen;  /* generation # of this list */
+        unsigned long             llpd_count; /* How many pages in this list */
+        atomic_t                  llpd_sample_count;
+        unsigned long             llpd_reblnc_count;
+        /* the pages in this list shouldn't be over this number */
+        unsigned long             llpd_budget; 
+        int                       llpd_cpu;
+        /* which page the pglist data is in */
+        struct page              *llpd_page; 
+
+        /* stats */
+        unsigned long             llpd_hit;
+        unsigned long             llpd_miss;
+        unsigned long             llpd_cross;
+};
+
  struct ll_sb_info {
          struct list_head          ll_list;
-        /* this protects pglist and ra_info.  It isn't safe to
-         * grab from interrupt contexts */
+        /* this protects pglist(only ll_async_page_max) and ra_info.  
+         * It isn't safe to grab from interrupt contexts. */
          spinlock_t                ll_lock;
          spinlock_t                ll_pp_extent_lock; /* Lock for pp_extent entries */
          spinlock_t                ll_process_lock; /* Lock for ll_rw_process_info */
@@ -275,10 +343,19 @@ struct ll_sb_info {
  
          struct lprocfs_stats     *ll_stats; /* lprocfs stats counter */
  
+        /* reblnc lock protects llpd_budget */
+        spinlock_t                ll_async_page_reblnc_lock;
+        unsigned long             ll_async_page_reblnc_count;
+        unsigned long             ll_async_page_sample_max;
+        /* I defined this array here rather than in ll_pglist_data
+         * because it is always accessed by only one cpu. -jay */
+        unsigned long            *ll_async_page_sample;
          unsigned long             ll_async_page_max;
-        unsigned long             ll_async_page_count;
-        unsigned long             ll_pglist_gen;
-        struct list_head          ll_pglist; /* all pages (llap_pglist_item) */
+        unsigned long             ll_async_page_clock_hand;
+        lcounter_t                ll_async_page_count;
+        struct ll_pglist_data   **ll_pglist;
+
+        struct lprocfs_stats     *ll_ra_stats;
  
          unsigned                  ll_contention_time; /* seconds */
          unsigned                  ll_lockless_truncate_enable; /* true/false */
@@ -321,7 +398,69 @@ struct ll_sb_info {
          unsigned long long        ll_sa_miss;    /* miss count */
  };
  
-#define LL_DEFAULT_MAX_RW_CHUNK         (32 * 1024 * 1024)
+#define LL_DEFAULT_MAX_RW_CHUNK      (32 * 1024 * 1024)
+
+#define LL_PGLIST_DATA_CPU(sbi, cpu) ((sbi)->ll_pglist[cpu])
+#define LL_PGLIST_DATA(sbi)          LL_PGLIST_DATA_CPU(sbi, smp_processor_id())
+
+static inline struct ll_pglist_data *ll_pglist_cpu_lock(
+                struct ll_sb_info *sbi, 
+                int cpu)
+{
+        spin_lock(&sbi->ll_pglist[cpu]->llpd_lock);
+        return LL_PGLIST_DATA_CPU(sbi, cpu);
+}
+
+static inline void ll_pglist_cpu_unlock(struct ll_sb_info *sbi, int cpu)
+{
+        spin_unlock(&sbi->ll_pglist[cpu]->llpd_lock);
+}
+
+static inline struct ll_pglist_data *ll_pglist_double_lock(
+                struct ll_sb_info *sbi, 
+                int cpu, struct ll_pglist_data **pd_cpu)
+{
+        int current_cpu = get_cpu();
+
+        if (cpu == current_cpu) {
+                ll_pglist_cpu_lock(sbi, cpu);
+        } else if (current_cpu < cpu) {
+                ll_pglist_cpu_lock(sbi, current_cpu);
+                ll_pglist_cpu_lock(sbi, cpu);
+        } else {
+                ll_pglist_cpu_lock(sbi, cpu);
+                ll_pglist_cpu_lock(sbi, current_cpu);
+        }
+
+        if (pd_cpu)
+                *pd_cpu = LL_PGLIST_DATA_CPU(sbi, cpu);
+
+        return LL_PGLIST_DATA(sbi);
+}
+
+static inline void ll_pglist_double_unlock(struct ll_sb_info *sbi, int cpu)
+{
+        int current_cpu = smp_processor_id();
+        if (cpu == current_cpu) {
+                ll_pglist_cpu_unlock(sbi, cpu);
+        } else {
+                ll_pglist_cpu_unlock(sbi, cpu);
+                ll_pglist_cpu_unlock(sbi, current_cpu);
+        }
+        put_cpu();
+}
+
+static inline struct ll_pglist_data *ll_pglist_lock(struct ll_sb_info *sbi)
+{
+        ll_pglist_cpu_lock(sbi, get_cpu());
+        return LL_PGLIST_DATA(sbi);
+}
+
+static inline void ll_pglist_unlock(struct ll_sb_info *sbi)
+{
+        ll_pglist_cpu_unlock(sbi, smp_processor_id());
+        put_cpu();
+}
  
  struct ll_ra_read {
          pgoff_t             lrr_start;
@@ -434,11 +573,7 @@ extern struct proc_dir_entry *proc_lustre_fs_root;
  
  static inline struct inode *ll_info2i(struct ll_inode_info *lli)
  {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
          return &lli->lli_vfs_inode;
-#else
-        return list_entry(lli, struct inode, u.generic_ip);
-#endif
  }
  
  struct it_cb_data {
@@ -462,7 +597,9 @@ struct ll_async_page {
                           llap_ra_used:1,
                           llap_ignore_quota:1,
                           llap_nocache:1,
-                         llap_lockless_io_page:1;
+                         llap_lockless_io_page:1,
+                         llap_reserved:7;
+        unsigned int     llap_pglist_cpu:16;
          void            *llap_cookie;
          struct page     *llap_page;
          struct list_head llap_pending_write;
@@ -488,8 +625,25 @@ enum {
  extern char *llap_origins[];
  
  #ifdef HAVE_REGISTER_CACHE
+#include <linux/cache_def.h>
  #define ll_register_cache(cache) register_cache(cache)
  #define ll_unregister_cache(cache) unregister_cache(cache)
+#elif defined(HAVE_SHRINKER_CACHE)
+struct cache_definition {
+        const char *name;
+        shrinker_t shrink;
+        struct shrinker *shrinker;
+};
+
+#define ll_register_cache(cache) do {                                   \
+        struct cache_definition *c = (cache);                           \
+        c->shrinker = set_shrinker(DEFAULT_SEEKS, c->shrink);           \
+} while(0)
+
+#define ll_unregister_cache(cache) do {                                 \
+        remove_shrinker((cache)->shrinker);                             \
+        (cache)->shrinker = NULL;                                       \
+} while(0)
  #else
  #define ll_register_cache(cache) do {} while (0)
  #define ll_unregister_cache(cache) do {} while (0)
@@ -553,7 +707,6 @@ static inline unsigned long dir_pages(struct inode *inode)
          return (inode->i_size + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
  }
  
-/* llite/namei.c */
  int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir);
  struct inode *ll_iget(struct super_block *sb, ino_t hash,
                        struct lustre_md *lic);
@@ -624,11 +777,9 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
  int ll_mdc_real_close(struct inode *inode, int flags);
  extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
                                 *file, size_t count, int rw);
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                 struct lookup_intent *it, struct kstat *stat);
  int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
-#endif
  struct ll_file_data *ll_file_data_get(void);
  #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
  int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd);
@@ -645,11 +796,17 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
                       int set_default);
  int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, 
                       int *lmm_size, struct ptlrpc_request **request);
+int ll_fsync(struct file *file, struct dentry *dentry, int data);
+int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+              int num_bytes);
  
  /* llite/dcache.c */
-extern struct dentry_operations ll_init_d_ops;
+/* llite/namei.c */
+/**
+ * protect race ll_find_aliases vs ll_revalidate_it vs ll_unhash_aliases
+ */
+extern spinlock_t ll_lookup_lock;
  extern struct dentry_operations ll_d_ops;
-extern struct dentry_operations ll_fini_d_ops;
  void ll_intent_drop_lock(struct lookup_intent *);
  void ll_intent_release(struct lookup_intent *);
  extern void ll_set_dd(struct dentry *de);
@@ -690,11 +847,12 @@ void ll_umount_begin(struct vfsmount *vfsmnt, int flags);
  void ll_umount_begin(struct super_block *sb);
  #endif
  int ll_remount_fs(struct super_block *sb, int *flags, char *data);
+int ll_show_options(struct seq_file *seq, struct vfsmount *vfs);
  int ll_prep_inode(struct obd_export *exp, struct inode **inode,
                    struct ptlrpc_request *req, int offset, struct super_block *);
  void lustre_dump_dentry(struct dentry *, int recur);
  void lustre_dump_inode(struct inode *);
-struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
+struct ll_async_page *llite_pglist_next_llap(struct list_head *head,
                                               struct list_head *list);
  int ll_obd_statfs(struct inode *inode, void *arg);
  int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
@@ -750,10 +908,8 @@ void ll_close_thread_shutdown(struct ll_close_queue *lcq);
  int ll_close_thread_start(struct ll_close_queue **lcq_ret);
  
  /* llite/llite_mmap.c */
-#if  (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  typedef struct rb_root  rb_root_t;
  typedef struct rb_node  rb_node_t;
-#endif
  
  struct ll_lock_tree_node;
  struct ll_lock_tree {
@@ -777,18 +933,11 @@ int ll_tree_unlock(struct ll_lock_tree *tree);
  
  #define    ll_s2sbi(sb)        (s2lsi(sb)->lsi_llsbi)
  
-#if  (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  static inline __u64 ll_ts2u64(struct timespec *time)
  {
          __u64 t = time->tv_sec;
          return t;
  }
-#else  /* 2.4 here */
-static inline __u64 ll_ts2u64(time_t *time)
-{
-        return *time;
-}
-#endif
  
  /* don't need an addref as the sb_info should be holding one */
  static inline struct obd_export *ll_s2obdexp(struct super_block *sb)
@@ -826,10 +975,21 @@ static inline struct obd_export *ll_i2mdcexp(struct inode *inode)
          return ll_s2mdcexp(inode->i_sb);
  }
  
+/** get lu_fid from inode. */
+static inline struct lu_fid *ll_inode_lu_fid(struct inode *inode)
+{
+        return &ll_i2info(inode)->lli_fid.f20;
+}
+
+/** get ll_fid from inode. */
+static inline struct ll_fid *ll_inode_ll_fid(struct inode *inode)
+{
+        return &ll_i2info(inode)->lli_fid.f16;
+}
+
  static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode)
  {
-        mdc_pack_fid(fid, inode->i_ino, inode->i_generation,
-                     inode->i_mode & S_IFMT);
+        *fid = *ll_inode_ll_fid(inode);
  }
  
  static inline int ll_mds_max_easize(struct super_block *sb)
@@ -867,6 +1027,8 @@ struct ll_statahead_info {
                                                   * reply */
          unsigned int            sai_max;        /* max ahead of lookup */
          unsigned int            sai_index;      /* index of statahead entry */
+        unsigned int            sai_index_next; /* index for the next statahead
+                                                 * entry to be stated */
          unsigned int            sai_hit;        /* hit count */
          unsigned int            sai_miss;       /* miss count:
                                                   * for "ls -al" case, it includes
@@ -884,7 +1046,9 @@ struct ll_statahead_info {
                                                   * hidden entries */
          cfs_waitq_t             sai_waitq;      /* stat-ahead wait queue */
          struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
-        struct list_head        sai_entries;    /* stat-ahead entries */
+        struct list_head        sai_entries_sent;     /* entries sent out */
+        struct list_head        sai_entries_received; /* entries returned */
+        struct list_head        sai_entries_stated;   /* entries stated */
  };
  
  int do_statahead_enter(struct inode *dir, struct dentry **dentry, int lookup);
@@ -892,13 +1056,19 @@ void ll_statahead_exit(struct dentry *dentry, int result);
  void ll_stop_statahead(struct inode *inode, void *key);
  
  static inline
-void ll_d_wakeup(struct dentry *dentry)
+void ll_statahead_mark(struct dentry *dentry)
  {
-        struct ll_dentry_data *lld = ll_d2d(dentry);
+        struct ll_inode_info *lli = ll_i2info(dentry->d_parent->d_inode);
+        struct ll_dentry_data *ldd = ll_d2d(dentry);
  
-        LASSERT(dentry->d_op != &ll_init_d_ops);
-        if (lld != NULL)
-                cfs_waitq_broadcast(&lld->lld_waitq);
+        /* not the same process, don't mark */
+        if (lli->lli_opendir_pid != cfs_curproc_pid())
+                return;
+
+        spin_lock(&lli->lli_lock);
+        if (likely(lli->lli_sai != NULL && ldd != NULL))
+                ldd->lld_sa_generation = lli->lli_sai->sai_generation;
+        spin_unlock(&lli->lli_lock);
  }
  
  static inline
@@ -911,6 +1081,10 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
          if (sbi->ll_sa_max == 0)
                  return -ENOTSUPP;
  
+        /* temporarily disable dir stat ahead in interoperability mode */
+        if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID)
+                return -ENOTSUPP;
+
          /* not the same process, don't statahead */
          if (lli->lli_opendir_pid != cfs_curproc_pid())
                  return -EBADF;
@@ -937,6 +1111,21 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
          return do_statahead_enter(dir, dentryp, lookup);
  }
  
+static void inline ll_dops_init(struct dentry *de, int block)
+{
+        struct ll_dentry_data *lld = ll_d2d(de);
+
+        if (lld == NULL && block != 0) {
+                ll_set_dd(de);
+                lld = ll_d2d(de);
+        }
+
+        if (lld != NULL)
+                lld->lld_sa_generation = 0;
+
+        de->d_op = &ll_d_ops;
+}
+
  /* llite ioctl register support rountine */
  #ifdef __KERNEL__
  enum llioc_iter {
@@ -982,6 +1171,9 @@ enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
  void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
  void ll_iocontrol_unregister(void *magic);
  
+ino_t ll_fid_build_ino(struct ll_sb_info *sbi,
+                       struct ll_fid *fid);
+
  #endif
  
  #endif /* LLITE_INTERNAL_H */
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c

index 669c50d..c440e86 100644 (file)
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light Super operations
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002-2005 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_lib.c
+ *
+ * Lustre Light Super operations
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -49,10 +66,70 @@ extern struct address_space_operations ll_dir_aops;
  #define log2(n) ffz(~(n))
  #endif
  
+static inline void ll_pglist_fini(struct ll_sb_info *sbi)
+{
+        struct page *page;
+        int i;
+
+        if (sbi->ll_pglist == NULL)
+                return;
+
+        for_each_possible_cpu(i) {
+                page = sbi->ll_pglist[i]->llpd_page;
+                if (page) {
+                        sbi->ll_pglist[i] = NULL;
+                        __free_page(page);
+                }
+        }
+
+        OBD_FREE(sbi->ll_pglist, sizeof(void *)*num_possible_cpus());
+        sbi->ll_pglist = NULL;
+}
+
+static inline int ll_pglist_init(struct ll_sb_info *sbi)
+{
+        struct ll_pglist_data *pd;
+        unsigned long budget;
+        int i, color = 0;
+        ENTRY;
+
+        OBD_ALLOC(sbi->ll_pglist, sizeof(void *) * num_possible_cpus());
+        if (sbi->ll_pglist == NULL)
+                RETURN(-ENOMEM);
+
+        budget = sbi->ll_async_page_max / num_online_cpus();
+        for_each_possible_cpu(i) {
+                struct page *page = alloc_pages_node(cpu_to_node(i),
+                                                    GFP_KERNEL, 0);
+                if (page == NULL) {
+                        ll_pglist_fini(sbi);
+                        RETURN(-ENOMEM);
+                }
+
+                if (color + L1_CACHE_ALIGN(sizeof(*pd)) > PAGE_SIZE)
+                        color = 0;
+
+                pd = (struct ll_pglist_data *)(page_address(page) + color);
+                memset(pd, 0, sizeof(*pd));
+                spin_lock_init(&pd->llpd_lock);
+                INIT_LIST_HEAD(&pd->llpd_list);
+                if (cpu_online(i))
+                        pd->llpd_budget = budget;
+                pd->llpd_cpu = i;
+                pd->llpd_page = page;
+                atomic_set(&pd->llpd_sample_count, 0);
+                sbi->ll_pglist[i] = pd;
+                color += L1_CACHE_ALIGN(sizeof(*pd));
+        }
+
+        RETURN(0);
+}
  
  static struct ll_sb_info *ll_init_sbi(void)
  {
          struct ll_sb_info *sbi = NULL;
+        unsigned long pages;
+        struct sysinfo si;
          class_uuid_t uuid;
          int i;
          ENTRY;
@@ -61,17 +138,37 @@ static struct ll_sb_info *ll_init_sbi(void)
          if (!sbi)
                  RETURN(NULL);
  
+        OBD_ALLOC(sbi->ll_async_page_sample, sizeof(long)*num_possible_cpus());
+        if (sbi->ll_async_page_sample == NULL)
+                GOTO(out, 0);
+
          spin_lock_init(&sbi->ll_lock);
          spin_lock_init(&sbi->ll_lco.lco_lock);
          spin_lock_init(&sbi->ll_pp_extent_lock);
          spin_lock_init(&sbi->ll_process_lock);
          sbi->ll_rw_stats_on = 0;
-        INIT_LIST_HEAD(&sbi->ll_pglist);
-        if (num_physpages >> (20 - CFS_PAGE_SHIFT) < 512)
-                sbi->ll_async_page_max = num_physpages / 2;
-        else
-                sbi->ll_async_page_max = (num_physpages / 4) * 3;
-        sbi->ll_ra_info.ra_max_pages = min(num_physpages / 8,
+
+        si_meminfo(&si);
+        pages = si.totalram - si.totalhigh;
+        if (pages >> (20 - CFS_PAGE_SHIFT) < 512) {
+#ifdef HAVE_BGL_SUPPORT
+                sbi->ll_async_page_max = pages / 4;
+#else
+                sbi->ll_async_page_max = pages / 2;
+#endif
+        } else {
+                sbi->ll_async_page_max = (pages / 4) * 3;
+        }
+
+        lcounter_init(&sbi->ll_async_page_count);
+        spin_lock_init(&sbi->ll_async_page_reblnc_lock);
+        sbi->ll_async_page_sample_max = 64 * num_online_cpus();
+        sbi->ll_async_page_reblnc_count = 0;
+        sbi->ll_async_page_clock_hand = 0;
+        if (ll_pglist_init(sbi))
+                GOTO(out, 0);
+
+        sbi->ll_ra_info.ra_max_pages = min(pages / 32,
                                             SBI_DEFAULT_READAHEAD_MAX);
          sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
                                             SBI_DEFAULT_READAHEAD_WHOLE_MAX;
@@ -112,6 +209,14 @@ static struct ll_sb_info *ll_init_sbi(void)
          sbi->ll_sa_max = LL_SA_RPC_DEF;
  
          RETURN(sbi);
+
+out:
+        if (sbi->ll_async_page_sample)
+                OBD_FREE(sbi->ll_async_page_sample,
+                         sizeof(long) * num_possible_cpus());
+        ll_pglist_fini(sbi);
+        OBD_FREE(sbi, sizeof(*sbi));
+        RETURN(NULL);
  }
  
  void ll_free_sbi(struct super_block *sb)
@@ -120,9 +225,13 @@ void ll_free_sbi(struct super_block *sb)
          ENTRY;
  
          if (sbi != NULL) {
+                ll_pglist_fini(sbi);
                  spin_lock(&ll_sb_lock);
                  list_del(&sbi->ll_list);
                  spin_unlock(&ll_sb_lock);
+                lcounter_destroy(&sbi->ll_async_page_count);
+                OBD_FREE(sbi->ll_async_page_sample,
+                         sizeof(long) * num_possible_cpus());
                  OBD_FREE(sbi, sizeof(*sbi));
          }
          EXIT;
@@ -168,9 +277,11 @@ static int client_common_fill_super(struct super_block *sb,
          }
  
          /* indicate the features supported by this client */
-        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS |
-                OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_NODEVOH |
-                OBD_CONNECT_CANCELSET | OBD_CONNECT_AT;
+        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS      |
+                                  OBD_CONNECT_JOIN    | OBD_CONNECT_ATTRFID    |
+                                  OBD_CONNECT_NODEVOH | OBD_CONNECT_CANCELSET  |
+                                  OBD_CONNECT_AT      | OBD_CONNECT_FID |
+                                  OBD_CONNECT_VBR     | OBD_CONNECT_LOV_V3;
  #ifdef HAVE_LRU_RESIZE_SUPPORT
          if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
                  data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
@@ -210,6 +321,9 @@ static int client_common_fill_super(struct super_block *sb,
                  GOTO(out, err);
          }
          sbi->ll_mdc_exp = class_conn2export(&mdc_conn);
+        err = obd_fid_init(sbi->ll_mdc_exp);
+        if (err)
+                GOTO(out_mdc, err);
  
          err = obd_statfs(obd, &osfs, cfs_time_current_64() - HZ, 0);
          if (err)
@@ -258,24 +372,17 @@ static int client_common_fill_super(struct super_block *sb,
          if (data->ocd_connect_flags & OBD_CONNECT_JOIN)
                  sbi->ll_flags |= LL_SBI_JOIN;
  
-        sbi->ll_sdev_orig = sb->s_dev;
-        /* We set sb->s_dev equal on all lustre clients in order to support
-         * NFS export clustering.  NFSD requires that the FSID be the same
-         * on all clients. */
-        /* s_dev is also used in lt_compare() to compare two fs, but that is
-         * only a node-local comparison. */
-        sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_target_uuid.uuid,
-                                 strlen(sbi2mdc(sbi)->cl_target_uuid.uuid));
          obd = class_name2obd(osc);
          if (!obd) {
                  CERROR("OSC %s: not setup or attached\n", osc);
                  GOTO(out_mdc, err = -ENODEV);
          }
  
-        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_GRANT |
-                OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | 
-                OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET | OBD_CONNECT_AT |
-                OBD_CONNECT_TRUNCLOCK;
+        data->ocd_connect_flags = OBD_CONNECT_VERSION   | OBD_CONNECT_GRANT    |
+                                  OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
+                                  OBD_CONNECT_SRVLOCK   | OBD_CONNECT_CANCELSET|
+                                  OBD_CONNECT_AT        | OBD_CONNECT_FID      |
+                                  OBD_CONNECT_VBR       | OBD_CONNECT_TRUNCLOCK;
  
          if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
                  /* OBD_CONNECT_CKSUM should always be set, even if checksums are
@@ -318,10 +425,12 @@ static int client_common_fill_super(struct super_block *sb,
          sbi->ll_osc_exp = class_conn2export(&osc_conn);
          spin_lock(&sbi->ll_lco.lco_lock);
          sbi->ll_lco.lco_flags = data->ocd_connect_flags;
+        sbi->ll_lco.lco_mdc_exp = sbi->ll_mdc_exp;
+        sbi->ll_lco.lco_osc_exp = sbi->ll_osc_exp;
          spin_unlock(&sbi->ll_lco.lco_lock);
  
          err = obd_register_page_removal_cb(sbi->ll_osc_exp,
-                                           ll_page_removal_cb, 
+                                           ll_page_removal_cb,
                                             ll_pin_extent_cb);
          if (err) {
                  CERROR("cannot register page removal callback: rc = %d\n",err);
@@ -364,11 +473,14 @@ static int client_common_fill_super(struct super_block *sb,
                  CERROR("cannot mds_connect: rc = %d\n", err);
                  GOTO(out_lock_cn_cb, err);
          }
-        CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id);
+        CDEBUG(D_SUPER, "rootfid "LPU64":"DFID"\n", rootfid.id,
+                        PFID((struct lu_fid*)&rootfid));
          sbi->ll_rootino = rootfid.id;
  
          sb->s_op = &lustre_super_operations;
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#if THREAD_SIZE >= 8192
+        /* Disable the NFS export because of stack overflow
+         * when THREAD_SIZE < 8192. Please refer to 17630. */
          sb->s_export_op = &lustre_export_operations;
  #endif
  
@@ -391,7 +503,7 @@ static int client_common_fill_super(struct super_block *sb,
          }
  
          LASSERT(sbi->ll_rootino != 0);
-        root = ll_iget(sb, sbi->ll_rootino, &md);
+        root = ll_iget(sb, ll_fid_build_ino(sbi, &rootfid), &md);
  
          ptlrpc_req_finished(request);
  
@@ -425,6 +537,16 @@ static int client_common_fill_super(struct super_block *sb,
          if (data != NULL)
                  OBD_FREE(data, sizeof(*data));
          sb->s_root->d_op = &ll_d_root_ops;
+
+        sbi->ll_sdev_orig = sb->s_dev;
+        /* We set sb->s_dev equal on all lustre clients in order to support
+         * NFS export clustering.  NFSD requires that the FSID be the same
+         * on all clients. */
+        /* s_dev is also used in lt_compare() to compare two fs, but that is
+         * only a node-local comparison. */
+        sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_target_uuid.uuid,
+                                 strlen(sbi2mdc(sbi)->cl_target_uuid.uuid));
+
          RETURN(err);
  
  out_root:
@@ -440,6 +562,7 @@ out_osc:
          obd_disconnect(sbi->ll_osc_exp);
          sbi->ll_osc_exp = NULL;
  out_mdc:
+        obd_fid_fini(sbi->ll_mdc_exp);
          obd_disconnect(sbi->ll_mdc_exp);
          sbi->ll_mdc_exp = NULL;
  out:
@@ -456,7 +579,7 @@ int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
          *lmmsize = obd_size_diskmd(sbi->ll_osc_exp, NULL);
          size = sizeof(int);
          rc = obd_get_info(sbi->ll_mdc_exp, sizeof(KEY_MAX_EASIZE),
-                          KEY_MAX_EASIZE, &size, lmmsize);
+                          KEY_MAX_EASIZE, &size, lmmsize, NULL);
          if (rc)
                  CERROR("Get max mdsize error rc %d \n", rc);
  
@@ -506,27 +629,6 @@ void lustre_dump_dentry(struct dentry *dentry, int recur)
          }
  }
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-void lustre_throw_orphan_dentries(struct super_block *sb)
-{
-        struct dentry *dentry, *next;
-        struct ll_sb_info *sbi = ll_s2sbi(sb);
-
-        /* Do this to get rid of orphaned dentries. That is not really trw. */
-        list_for_each_entry_safe(dentry, next, &sbi->ll_orphan_dentry_list,
-                                 d_hash) {
-                CWARN("found orphan dentry %.*s (%p->%p) at unmount, dumping "
-                      "before and after shrink_dcache_parent\n",
-                      dentry->d_name.len, dentry->d_name.name, dentry, next);
-                lustre_dump_dentry(dentry, 1);
-                shrink_dcache_parent(dentry);
-                lustre_dump_dentry(dentry, 1);
-        }
-}
-#else
-#define lustre_throw_orphan_dentries(sb)
-#endif
-
  #ifdef HAVE_EXPORT___IGET
  static void prune_dir_dentries(struct inode *inode)
  {
@@ -640,11 +742,10 @@ void client_common_put_super(struct super_block *sb)
          obd_disconnect(sbi->ll_osc_exp);
          sbi->ll_osc_exp = NULL;
  
+        obd_fid_fini(sbi->ll_mdc_exp);
          obd_disconnect(sbi->ll_mdc_exp);
          sbi->ll_mdc_exp = NULL;
  
-        lustre_throw_orphan_dentries(sb);
-
          EXIT;
  }
  
@@ -660,7 +761,7 @@ void ll_kill_super(struct super_block *sb)
  
          sbi = ll_s2sbi(sb);
          /* we need restore s_dev from changed for clustred NFS before put_super
-         * because new kernels have cached s_dev and change sb->s_dev in 
+         * because new kernels have cached s_dev and change sb->s_dev in
           * put_super not affected real removing devices */
          if (sbi)
                  sb->s_dev = sbi->ll_sdev_orig;
@@ -748,10 +849,14 @@ static int ll_options(char *options, int *flags)
                          /* Ignore deprecated mount option.  The client will
                           * always try to mount with ACL support, whether this
                           * is used depends on whether server supports it. */
+                        LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+                                                  "mount option 'acl'.\n");
                          goto next;
                  }
                  tmp = ll_set_opt("noacl", s1, LL_SBI_ACL);
                  if (tmp) {
+                        LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+                                                  "mount option 'noacl'.\n");
                          goto next;
                  }
  
@@ -989,7 +1094,7 @@ out:
  
  int ll_fill_super(struct super_block *sb)
  {
-        struct lustre_profile *lprof;
+        struct lustre_profile *lprof = NULL;
          struct lustre_sb_info *lsi = s2lsi(sb);
          struct ll_sb_info *sbi;
          char  *osc = NULL, *mdc = NULL;
@@ -997,6 +1102,8 @@ int ll_fill_super(struct super_block *sb)
          struct config_llog_instance cfg = {0, };
          char   ll_instance[sizeof(sb) * 2 + 1];
          int    err;
+        char  *save = NULL;
+        char  pseudo[32] = { 0 };
          ENTRY;
  
          CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
@@ -1020,6 +1127,7 @@ int ll_fill_super(struct super_block *sb)
          sprintf(ll_instance, "%p", sb);
          cfg.cfg_instance = ll_instance;
          cfg.cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
+        cfg.cfg_sb = sb;
  
          /* set up client obds */
          if (strchr(profilenm, '/') != NULL) /* COMPAT_146 */
@@ -1071,6 +1179,30 @@ int ll_fill_super(struct super_block *sb)
                                     "exist?\n", profilenm);
                  GOTO(out_free, err = -EINVAL);
          }
+
+        /*
+         * The configuration for 1.8 client and 2.0 client are different.
+         * 2.0 introduces lmv, but 1.8 directly uses mdc.
+         * Here, we will hack to use proper name for mdc if needed.
+         */
+        {
+                char *fsname_end;
+                int namelen;
+
+                save = lprof->lp_mdc;
+                fsname_end = strrchr(save, '-');
+                if (fsname_end) {
+                        namelen = fsname_end - save;
+                        if (strcmp(fsname_end, "-clilmv") == 0) {
+                                strncpy(pseudo, save, namelen);
+                                strcat(pseudo, "-MDT0000-mdc");
+                                lprof->lp_mdc = pseudo;
+                                CDEBUG(D_INFO, "1.8.x connecting to 2.0: lmv=%s"
+                                       " new mdc=%s\n", save, pseudo);
+                        }
+                }
+        }
+
          CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
                 lprof->lp_mdc, lprof->lp_osc);
  
@@ -1090,6 +1222,8 @@ int ll_fill_super(struct super_block *sb)
          err = client_common_fill_super(sb, mdc, osc);
  
  out_free:
+        if (save && lprof)
+                lprof->lp_mdc = save;
          if (mdc)
                  OBD_FREE(mdc, strlen(mdc) + 1);
          if (osc)
@@ -1122,7 +1256,7 @@ void ll_put_super(struct super_block *sb)
  
          if (sbi->ll_mdc_exp) {
                  obd = class_exp2obd(sbi->ll_mdc_exp);
-                if (obd) 
+                if (obd)
                          force = obd->obd_force;
          }
  
@@ -1161,9 +1295,9 @@ void ll_put_super(struct super_block *sb)
          EXIT;
  } /* client_put_super */
  
-#ifdef HAVE_REGISTER_CACHE
-#include <linux/cache_def.h>
-#ifdef HAVE_CACHE_RETURN_INT
+#if defined(HAVE_REGISTER_CACHE) || defined(HAVE_SHRINKER_CACHE)
+
+#if defined(HAVE_CACHE_RETURN_INT)
  static int
  #else
  static void
@@ -1176,7 +1310,7 @@ ll_shrink_cache(int priority, unsigned int gfp_mask)
          list_for_each_entry(sbi, &ll_super_blocks, ll_list)
                  count += llap_shrink_cache(sbi, priority);
  
-#ifdef HAVE_CACHE_RETURN_INT
+#if defined(HAVE_CACHE_RETURN_INT)
          return count;
  #endif
  }
@@ -1185,7 +1319,7 @@ struct cache_definition ll_cache_definition = {
          .name = "llap_cache",
          .shrink = ll_shrink_cache
  };
-#endif /* HAVE_REGISTER_CACHE */
+#endif /* HAVE_REGISTER_CACHE || HAVE_SHRINKER_CACHE */
  
  struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
  {
@@ -1310,7 +1444,7 @@ static int ll_setattr_do_truncate(struct inode *inode, loff_t new_size)
          UNLOCK_INODE_MUTEX(inode);
          UP_WRITE_I_ALLOC_SEM(inode);
  
-        if (sbi->ll_lockless_truncate_enable && 
+        if (sbi->ll_lockless_truncate_enable &&
              (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK)) {
                  ast_flags = LDLM_FL_BLOCK_GRANTED;
                  rc = obd_match(sbi->ll_osc_exp, lsm, LDLM_EXTENT,
@@ -1332,13 +1466,8 @@ static int ll_setattr_do_truncate(struct inode *inode, loff_t new_size)
                          local_lock = 1;
          }
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        DOWN_WRITE_I_ALLOC_SEM(inode);
-        LOCK_INODE_MUTEX(inode);
-#else
          LOCK_INODE_MUTEX(inode);
          DOWN_WRITE_I_ALLOC_SEM(inode);
-#endif
          if (likely(rc == 0)) {
                  /* Only ll_inode_size_lock is taken at this level.
                   * lov_stripe_lock() is grabbed by ll_truncate() only over
@@ -1388,7 +1517,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
          struct lov_stripe_md *lsm = lli->lli_smd;
          struct ll_sb_info *sbi = ll_i2sbi(inode);
          struct ptlrpc_request *request = NULL;
-        struct mdc_op_data op_data;
+        struct mdc_op_data op_data = { { 0 } };
          struct lustre_md md;
          int ia_valid = attr->ia_valid;
          int rc = 0;
@@ -1410,7 +1539,8 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
  
          /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
          if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
-                if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+                if (current->fsuid != inode->i_uid &&
+                    !cfs_capable(CFS_CAP_FOWNER))
                          RETURN(-EPERM);
          }
  
@@ -1427,16 +1557,6 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
                  attr->ia_mtime = CURRENT_TIME;
                  attr->ia_valid |= ATTR_MTIME_SET;
          }
-        if ((attr->ia_valid & ATTR_CTIME) && !(attr->ia_valid & ATTR_MTIME)) {
-                /* To avoid stale mtime on mds, obtain it from ost and send
-                   to mds. */
-                rc = ll_glimpse_size(inode, 0);
-                if (rc)
-                        RETURN(rc);
-
-                attr->ia_valid |= ATTR_MTIME_SET | ATTR_MTIME;
-                attr->ia_mtime = inode->i_mtime;
-        }
  
          if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
                  CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
@@ -1498,23 +1618,95 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
          if (ia_valid & ATTR_SIZE) {
                  rc = ll_setattr_do_truncate(inode, attr->ia_size);
          } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
-                obd_flag flags;
                  struct obd_info oinfo = { { { 0 } } };
                  struct obdo *oa;
-                OBDO_ALLOC(oa);
+                struct lustre_handle lockh = { 0 };
+                obd_valid valid;
  
                  CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
                         inode->i_ino, LTIME_S(attr->ia_mtime));
  
+                OBDO_ALLOC(oa);
                  if (oa) {
                          oa->o_id = lsm->lsm_object_id;
-                        oa->o_valid = OBD_MD_FLID;
+                        oa->o_gr = lsm->lsm_object_gr;
+                        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+                        valid = OBD_MD_FLTYPE | OBD_MD_FLFID | OBD_MD_FLGENER;
+
+                        if (LTIME_S(attr->ia_mtime) < LTIME_S(attr->ia_ctime)){
+                                struct ost_lvb xtimes;
+
+                                /* setting mtime to past is performed under PW
+                                 * EOF extent lock */
+                                oinfo.oi_policy.l_extent.start = 0;
+                                oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
+                                rc = ll_extent_lock(NULL, inode, lsm, LCK_PW,
+                                                    &oinfo.oi_policy,
+                                                    &lockh, 0);
+                                if (rc)
+                                        RETURN(rc);
+
+                                /* setattr under locks
+                                 *
+                                 * 1. restore inode's timestamps which
+                                 * are about to be set as long as
+                                 * concurrent stat (via
+                                 * ll_glimpse_size) might bring
+                                 * out-of-date ones
+                                 *
+                                 * 2. update lsm so that next stat
+                                 * (via ll_glimpse_size) could get
+                                 * correct values in lsm */
+                                lov_stripe_lock(lli->lli_smd);
+                                if (ia_valid & ATTR_ATIME) {
+                                        LTIME_S(inode->i_atime) =
+                                                xtimes.lvb_atime =
+                                                LTIME_S(attr->ia_atime);
+                                        valid |= OBD_MD_FLATIME;
+                                }
+                                if (ia_valid & ATTR_MTIME) {
+                                        LTIME_S(inode->i_mtime) =
+                                                xtimes.lvb_mtime =
+                                                LTIME_S(attr->ia_mtime);
+                                        valid |= OBD_MD_FLMTIME;
+                                }
+                                if (ia_valid & ATTR_CTIME) {
+                                        LTIME_S(inode->i_ctime) =
+                                                xtimes.lvb_ctime =
+                                                LTIME_S(attr->ia_ctime);
+                                        valid |= OBD_MD_FLCTIME;
+                                }
  
-                        flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
-                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                                OBD_MD_FLFID | OBD_MD_FLGENER;
+                                obd_update_lvb(ll_i2obdexp(inode), lli->lli_smd,
+                                               &xtimes, valid);
+                                lov_stripe_unlock(lli->lli_smd);
+                        } else {
+                                /* lockless setattr
+                                 *
+                                 * 1. do not use inode's timestamps
+                                 * because concurrent stat might fill
+                                 * the inode with out-of-date times,
+                                 * send values from attr instead
+                                 *
+                                 * 2.do no update lsm, as long as stat
+                                 * (via ll_glimpse_size) will bring
+                                 * attributes from osts anyway */
+                                if (ia_valid & ATTR_ATIME) {
+                                        oa->o_atime = LTIME_S(attr->ia_atime);
+                                        oa->o_valid |= OBD_MD_FLATIME;
+                                }
+                                if (ia_valid & ATTR_MTIME) {
+                                        oa->o_mtime = LTIME_S(attr->ia_mtime);
+                                        oa->o_valid |= OBD_MD_FLMTIME;
+                                }
+                                if (ia_valid & ATTR_CTIME) {
+                                        oa->o_ctime = LTIME_S(attr->ia_ctime);
+                                        oa->o_valid |= OBD_MD_FLCTIME;
+                                }
+                        }
  
-                        obdo_from_inode(oa, inode, flags);
+                        obdo_from_inode(oa, inode, valid);
  
                          oinfo.oi_oa = oa;
                          oinfo.oi_md = lsm;
@@ -1522,6 +1714,19 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
                          rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL);
                          if (rc)
                                  CERROR("obd_setattr_async fails: rc=%d\n", rc);
+
+                        if (LTIME_S(attr->ia_mtime) < LTIME_S(attr->ia_ctime)){
+                                int err;
+
+                                err = ll_extent_unlock(NULL, inode, lsm,
+                                                       LCK_PW, &lockh);
+                                if (unlikely(err != 0)) {
+                                        CERROR("extent unlock failed: "
+                                               "err=%d\n", err);
+                                        if (rc == 0)
+                                                rc = err;
+                                }
+                        }
                          OBDO_FREE(oa);
                  } else {
                          rc = -ENOMEM;
@@ -1537,7 +1742,7 @@ int ll_setattr(struct dentry *de, struct iattr *attr)
          if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
              (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
                  attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
-        if ((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == 
+        if ((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
              (ATTR_SIZE|ATTR_MODE)) {
                  mode = de->d_inode->i_mode;
                  if (((mode & S_ISUID) && (!(attr->ia_mode & S_ISUID))) ||
@@ -1695,11 +1900,16 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
          struct ll_inode_info *lli = ll_i2info(inode);
          struct mds_body *body = md->body;
          struct lov_stripe_md *lsm = md->lsm;
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        ENTRY;
+
+        CDEBUG(D_INODE, "body->valid = "LPX64"\n", body->valid);
  
          LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
          if (lsm != NULL) {
                  if (lli->lli_smd == NULL) {
-                        if (lsm->lsm_magic != LOV_MAGIC &&
+                        if (lsm->lsm_magic != LOV_MAGIC_V1 &&
+                            lsm->lsm_magic != LOV_MAGIC_V3 &&
                              lsm->lsm_magic != LOV_MAGIC_JOIN) {
                                  dump_lsm(D_ERROR, lsm);
                                  LBUG();
@@ -1744,25 +1954,28 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
          }
  #endif
  
-        if (body->valid & OBD_MD_FLID)
-                inode->i_ino = body->ino;
-        if (body->valid & OBD_MD_FLATIME &&
-            body->atime > LTIME_S(inode->i_atime))
-                LTIME_S(inode->i_atime) = body->atime;
-
-        /* mtime is always updated with ctime, but can be set in past.
-           As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, so take mtime from mds
-           for the same ctimes. */
-        if (body->valid & OBD_MD_FLCTIME &&
-            body->ctime >= LTIME_S(inode->i_ctime)) {
-                LTIME_S(inode->i_ctime) = body->ctime;
-                if (body->valid & OBD_MD_FLMTIME) {
-                        CDEBUG(D_INODE, "setting ino %lu mtime "
-                               "from %lu to "LPU64"\n", inode->i_ino,
+        inode->i_ino = ll_fid_build_ino(sbi, &body->fid1);
+        if (body->valid & OBD_MD_FLGENER)
+                inode->i_generation = body->generation;
+
+        if (body->valid & OBD_MD_FLATIME) {
+                if (body->atime > LTIME_S(inode->i_atime))
+                        LTIME_S(inode->i_atime) = body->atime;
+                lli->lli_lvb.lvb_atime = body->atime;
+        }
+        if (body->valid & OBD_MD_FLMTIME) {
+                if (body->mtime > LTIME_S(inode->i_mtime)) {
+                        CDEBUG(D_INODE, "setting ino %lu mtime from %lu "
+                               "to "LPU64"\n", inode->i_ino,
                                 LTIME_S(inode->i_mtime), body->mtime);
                          LTIME_S(inode->i_mtime) = body->mtime;
                  }
+                lli->lli_lvb.lvb_mtime = body->mtime;
+        }
+        if (body->valid & OBD_MD_FLCTIME) {
+                if (body->ctime > LTIME_S(inode->i_ctime))
+                        LTIME_S(inode->i_ctime) = body->ctime;
+                lli->lli_lvb.lvb_ctime = body->ctime;
          }
          if (body->valid & OBD_MD_FLMODE)
                  inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
@@ -1781,17 +1994,13 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
          if (body->valid & OBD_MD_FLGID)
                  inode->i_gid = body->gid;
          if (body->valid & OBD_MD_FLFLAGS)
-                inode->i_flags = ll_ext_to_inode_flags(body->flags);
+                inode->i_flags = ll_ext_to_inode_flags(body->flags |
+                                                       MDS_BFLAG_EXT_FLAGS);
          if (body->valid & OBD_MD_FLNLINK)
                  inode->i_nlink = body->nlink;
-        if (body->valid & OBD_MD_FLGENER)
-                inode->i_generation = body->generation;
+
          if (body->valid & OBD_MD_FLRDEV)
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                inode->i_rdev = body->rdev;
-#else
                  inode->i_rdev = old_decode_dev(body->rdev);
-#endif
          if (body->valid & OBD_MD_FLSIZE) {
  #if 0           /* Can't block ll_test_inode->ll_update_inode, b=14326*/
                  ll_inode_size_lock(inode, 0);
@@ -1806,9 +2015,9 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
  
          if (body->valid & OBD_MD_FLSIZE)
                  set_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &lli->lli_flags);
+        EXIT;
  }
  
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  static struct backing_dev_info ll_backing_dev_info = {
          .ra_pages       = 0,    /* No readahead */
  #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
@@ -1817,7 +2026,6 @@ static struct backing_dev_info ll_backing_dev_info = {
          .memory_backed  = 0,    /* Does contribute to dirty memory */
  #endif
  };
-#endif
  
  void ll_read_inode2(struct inode *inode, void *opaque)
  {
@@ -1860,16 +2068,10 @@ void ll_read_inode2(struct inode *inode, void *opaque)
                  EXIT;
          } else {
                  inode->i_op = &ll_special_inode_operations;
-
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
                  init_special_inode(inode, inode->i_mode,
                                     kdev_t_to_nr(inode->i_rdev));
-
                  /* initializing backing dev info. */
                  inode->i_mapping->backing_dev_info = &ll_backing_dev_info;
-#else
-                init_special_inode(inode, inode->i_mode, inode->i_rdev);
-#endif
                  EXIT;
          }
  }
@@ -1900,13 +2102,13 @@ int ll_iocontrol(struct inode *inode, struct file *file,
                  /* We want to return EXT3_*_FL flags to the caller via this
                   * ioctl.  An older MDS may be sending S_* flags, fix it up. */
                  flags = ll_inode_to_ext_flags(body->flags,
-                                              body->flags &MDS_BFLAG_EXT_FLAGS);
+                                              MDS_BFLAG_EXT_FLAGS);
                  ptlrpc_req_finished (req);
  
                  RETURN(put_user(flags, (int *)arg));
          }
          case EXT3_IOC_SETFLAGS: {
-                struct mdc_op_data op_data;
+                struct mdc_op_data op_data = { { 0 } };
                  struct ll_iattr_struct attr;
                  struct obd_info oinfo = { { { 0 } } };
                  struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
@@ -1934,8 +2136,10 @@ int ll_iocontrol(struct inode *inode, struct file *file,
                  }
  
                  oinfo.oi_oa->o_id = lsm->lsm_object_id;
+                oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
                  oinfo.oi_oa->o_flags = flags;
-                oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
+                oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP |
+                                       OBD_MD_FLFLAGS;
  
                  obdo_from_inode(oinfo.oi_oa, inode,
                                  OBD_MD_FLFID | OBD_MD_FLGENER);
@@ -2011,9 +2215,19 @@ void ll_umount_begin(struct super_block *sb)
  
          /* Really, we'd like to wait until there are no requests outstanding,
           * and then continue.  For now, we just invalidate the requests,
-         * schedule, and hope.
+         * schedule() and sleep one second if needed, and hope.
           */
          schedule();
+#ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
+        if (atomic_read(&vfsmnt->mnt_count) > 2) {
+                cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE,
+                                     cfs_time_seconds(1));
+                if (atomic_read(&vfsmnt->mnt_count) > 2)
+                        LCONSOLE_WARN("Mount still busy with %d refs! You "
+                                      "may try to umount it a bit later\n",
+                                      atomic_read(&vfsmnt->mnt_count));
+        }
+#endif
  
          EXIT;
  }
@@ -2029,6 +2243,17 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data)
                  err = obd_set_info_async(sbi->ll_mdc_exp, sizeof(KEY_READONLY),
                                           KEY_READONLY, sizeof(read_only),
                                           &read_only, NULL);
+
+                /* MDS might have expected a different ro key value, b=17493 */
+                if (err == -EINVAL) {
+                        CDEBUG(D_CONFIG, "Retrying remount with 1.6.6 ro key\n");
+                        err = obd_set_info_async(sbi->ll_mdc_exp,
+                                                 sizeof(KEY_READONLY_166COMPAT),
+                                                 KEY_READONLY_166COMPAT,
+                                                 sizeof(read_only),
+                                                 &read_only, NULL);
+                }
+
                  if (err) {
                          CERROR("Failed to change the read-only flag during "
                                 "remount: %d\n", err);
@@ -2063,7 +2288,10 @@ int ll_prep_inode(struct obd_export *exp, struct inode **inode,
                  ll_update_inode(*inode, &md);
          } else {
                  LASSERT(sb);
-                *inode = ll_iget(sb, md.body->ino, &md);
+                /** hashing VFS inode by FIDs.
+                 * IGIF will be used for for compatibility if needed.
+                 */
+                *inode =ll_iget(sb, ll_fid_build_ino(sbi, &md.body->fid1), &md);
                  if (*inode == NULL || is_bad_inode(*inode)) {
                          mdc_free_lustre_md(exp, &md);
                          rc = -ENOMEM;
@@ -2087,14 +2315,14 @@ char *llap_origins[] = {
          [LLAP_ORIGIN_LOCKLESS_IO] = "ls"
  };
  
-struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
+struct ll_async_page *llite_pglist_next_llap(struct list_head *head,
                                               struct list_head *list)
  {
          struct ll_async_page *llap;
          struct list_head *pos;
  
          list_for_each(pos, list) {
-                if (pos == &sbi->ll_pglist)
+                if (pos == head)
                          return NULL;
                  llap = list_entry(pos, struct ll_async_page, llap_pglist_item);
                  if (llap->llap_page == NULL)
@@ -2199,3 +2427,27 @@ int ll_process_config(struct lustre_cfg *lcfg)
          return(rc);
  }
  
+int ll_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+        struct ll_sb_info *sbi;
+
+        LASSERT((seq != NULL) && (vfs != NULL));
+        sbi = ll_s2sbi(vfs->mnt_sb);
+
+        if (sbi->ll_flags & LL_SBI_NOLCK)
+                seq_puts(seq, ",nolock");
+
+        if (sbi->ll_flags & LL_SBI_FLOCK)
+                seq_puts(seq, ",flock");
+
+        if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+                seq_puts(seq, ",localflock");
+
+        if (sbi->ll_flags & LL_SBI_USER_XATTR)
+                seq_puts(seq, ",user_xattr");
+
+        if (sbi->ll_flags & LL_SBI_ACL)
+                seq_puts(seq, ",acl");
+
+        RETURN(0);
+}
diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c

index 0961479..886cf9d 100644 (file)
--- a/lustre/llite/llite_mmap.c
+++ b/lustre/llite/llite_mmap.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #ifndef AUTOCONF_INCLUDED
  #include <linux/config.h>
@@ -38,9 +53,6 @@
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/smp_lock.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/iobuf.h>
-#endif
  
  #define DEBUG_SUBSYSTEM S_LLITE
  
@@ -590,8 +602,7 @@ int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
          ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), LPROC_LL_MAP, 1);
          rc = generic_file_mmap(file, vma);
          if (rc == 0) {
-#if !defined(HAVE_FILEMAP_POPULATE) && \
-    (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#ifndef HAVE_FILEMAP_POPULATE
                  if (!filemap_populate)
                          filemap_populate = vma->vm_ops->populate;
  #endif
diff --git a/lustre/llite/llite_nfs.c b/lustre/llite/llite_nfs.c

index 03dd480..c941d73 100644 (file)
--- a/lustre/llite/llite_nfs.c
+++ b/lustre/llite/llite_nfs.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   NFS export of Lustre Light File System 
+ * GPL HEADER START
   *
- *   Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_nfs.c
+ *
+ * NFS export of Lustre Light File System
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -40,11 +57,7 @@ __u32 get_uuid2int(const char *name, int len)
          return (key0 << 1);
  }
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-static int ll_nfs_test_inode(struct inode *inode, unsigned long ino, void *opaque)
-#else
  static int ll_nfs_test_inode(struct inode *inode, void *opaque)
-#endif
  {
          struct ll_fid *iid = opaque;
  
@@ -103,9 +116,6 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino,
  {
          struct inode *inode;
          struct dentry *result;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        struct list_head *lp;
-#endif
          ENTRY;
  
          if (ino == 0)
@@ -126,51 +136,12 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino,
                  RETURN(ERR_PTR(-ESTALE));
          }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
          result = d_alloc_anon(inode);
          if (!result) {
                  iput(inode);
                  RETURN(ERR_PTR(-ENOMEM));
          }
-#else
-        /* now to find a dentry.
-         * If possible, get a well-connected one
-         */
-        spin_lock(&dcache_lock);
-        for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) {
-                result = list_entry(lp,struct dentry, d_alias);
-                lock_dentry(result);
-                if (!(result->d_flags & DCACHE_DISCONNECTED)) {
-                        dget_locked(result);
-                        ll_set_dflags(result, DCACHE_REFERENCED);
-                        unlock_dentry(result);
-                        spin_unlock(&dcache_lock);
-                        iput(inode);
-                        RETURN(result);
-                }
-                unlock_dentry(result);
-        }
-        spin_unlock(&dcache_lock);
-        result = d_alloc_root(inode);
-        if (result == NULL) {
-                iput(inode);
-                RETURN(ERR_PTR(-ENOMEM));
-        }
-        result->d_flags |= DCACHE_DISCONNECTED;
-
-#endif
-        ll_set_dd(result);
-
-        lock_dentry(result);
-        if (unlikely(result->d_op == &ll_init_d_ops)) {
-                result->d_op = &ll_d_ops;
-                unlock_dentry(result);
-                smp_wmb();
-                ll_d_wakeup(result);
-        } else {
-                result->d_op = &ll_d_ops;
-                unlock_dentry(result);
-        }
+        ll_dops_init(result, 1);
  
          RETURN(result);
  }
@@ -219,7 +190,7 @@ int ll_dentry_to_fh(struct dentry *dentry, __u32 *datap, int *lenp,
          return 1;
  }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#if THREAD_SIZE >= 8192
  struct dentry *ll_get_dentry(struct super_block *sb, void *data)
  {
          __u32 *inump = (__u32*)data;
diff --git a/lustre/llite/lloop.c b/lustre/llite/lloop.c

index f42fd4a..f7921d0 100644 (file)
--- a/lustre/llite/lloop.c
+++ b/lustre/llite/lloop.c
@@ -1,24 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre virtual block device emulator.
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  /*
@@ -29,9 +42,6 @@
   * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
   * permitted under the GNU General Public License.
   *
- * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
- * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
- *
   * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
   * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
   *
@@ -43,10 +53,6 @@
   *
   * Loadable modules and other fixes by AK, 1998
   *
- * Make real block number available to downstream transfer functions, enables
- * CBC (and relatives) mode encryption requiring unique IVs per data block.
- * Reed H. Petty, rhp@draper.net
- *
   * Maximum number of loop devices now dynamic via max_loop module parameter.
   * Russell Kroll <rkroll@exploits.org> 19990701
   *
@@ -106,7 +112,7 @@
  #include <lustre_lite.h>
  #include "llite_internal.h"
  
-#define LLOOP_MAX_SEGMENTS        PTLRPC_MAX_BRW_PAGES
+#define LLOOP_MAX_SEGMENTS    PTLRPC_MAX_BRW_PAGES
  
  /* Possible states of device */
  enum {
@@ -121,8 +127,8 @@ struct lloop_device {
          loff_t             lo_offset;
          loff_t             lo_sizelimit;
          int                lo_flags;
-        int                (*ioctl)(struct lloop_device *, int cmd, 
-                                 unsigned long arg); 
+        int                (*ioctl)(struct lloop_device *, int cmd,
+                                    unsigned long arg);
  
          struct file *      lo_backing_file;
          struct block_device *lo_device;
@@ -136,8 +142,8 @@ struct lloop_device {
          int                lo_state;
          struct semaphore   lo_sem;
          struct semaphore   lo_ctl_mutex;
-        struct semaphore   lo_bh_mutex;
          atomic_t           lo_pending;
+        wait_queue_head_t  lo_bh_wait;
  
          request_queue_t    *lo_queue;
  
@@ -146,7 +152,6 @@ struct lloop_device {
                  struct brw_page    lrd_pages[LLOOP_MAX_SEGMENTS];
                  struct obdo        lrd_oa;
          } lo_requests[1];
-
  };
  
  /*
@@ -156,8 +161,9 @@ enum {
          LO_FLAGS_READ_ONLY       = 1,
  };
  
+#define MAX_LOOP_DEFAULT  16
  static int lloop_major;
-static int max_loop = 8;
+static int max_loop = MAX_LOOP_DEFAULT;
  static struct lloop_device *loop_dev;
  static struct gendisk **disks;
  static struct semaphore lloop_mutex;
@@ -181,7 +187,7 @@ static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
          return loopsize >> 9;
  }
  
-static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio)
+static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
  {
          struct inode *inode = lo->lo_backing_file->f_dentry->d_inode;
          struct ll_inode_info *lli = ll_i2info(inode);
@@ -190,42 +196,51 @@ static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio)
          struct brw_page *pg = lo->lo_requests[0].lrd_pages;
          struct obdo *oa = &lo->lo_requests[0].lrd_oa;
          pgoff_t offset;
-        int ret, cmd, i;
+        int ret, i, rw;
+        obd_count page_count = 0;
          struct bio_vec *bvec;
+        struct bio *bio;
  
-        BUG_ON(bio->bi_hw_segments > LLOOP_MAX_SEGMENTS);
+        LASSERT(head != NULL);
  
-        offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
-        bio_for_each_segment(bvec, bio, i) {
-                BUG_ON(bvec->bv_offset != 0);
-                BUG_ON(bvec->bv_len != CFS_PAGE_SIZE);
+        rw = head->bi_rw;
+        for (bio = head; bio != NULL; bio = bio->bi_next) {
+                LASSERT(rw == bio->bi_rw);
  
-                pg->pg = bvec->bv_page;
-                pg->off = offset;
-                pg->count = bvec->bv_len;
-                pg->flag = OBD_BRW_SRVLOCK;
+                offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
+                bio_for_each_segment(bvec, bio, i) {
+                        BUG_ON(bvec->bv_offset != 0);
+                        BUG_ON(bvec->bv_len != CFS_PAGE_SIZE);
  
-                pg++;
-                offset += bvec->bv_len;
+                        pg->pg = bvec->bv_page;
+                        pg->off = offset;
+                        pg->count = bvec->bv_len;
+                        pg->flag = OBD_BRW_SRVLOCK;
+
+                        CDEBUG(D_INFO, "index %lu offset "LPU64", count %u\n",
+                               pg->pg->index, pg->off, pg->count);
+                        pg++;
+                        page_count++;
+                        offset += bvec->bv_len;
+                }
+                LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
          }
  
+        ll_stats_ops_tally(ll_i2sbi(inode),
+                        (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
+                        page_count << PAGE_CACHE_SHIFT);
+
          oa->o_mode = inode->i_mode;
          oa->o_id = lsm->lsm_object_id;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
+        oa->o_gr = lsm->lsm_object_gr;
+        oa->o_valid = OBD_MD_FLID   | OBD_MD_FLGROUP |
+                      OBD_MD_FLMODE | OBD_MD_FLTYPE;
          obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
  
-        cmd = OBD_BRW_READ;
-        if (bio_rw(bio) == WRITE)
-                cmd = OBD_BRW_WRITE;
-
-        if (cmd == OBD_BRW_WRITE)
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE, bio->bi_size);
-        else
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ, bio->bi_size);
          oinfo.oi_oa = oa;
          oinfo.oi_md = lsm;
-        ret = obd_brw(cmd, ll_i2obdexp(inode), &oinfo, 
-                      (obd_count)(i - bio->bi_idx), 
+        ret = obd_brw((rw == WRITE) ? OBD_BRW_WRITE : OBD_BRW_READ,
+                      ll_i2obdexp(inode), &oinfo, (obd_count)page_count,
                        lo->lo_requests[0].lrd_pages, NULL);
          if (ret == 0)
                  obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
@@ -248,41 +263,77 @@ static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
                  lo->lo_bio = lo->lo_biotail = bio;
          spin_unlock_irqrestore(&lo->lo_lock, flags);
  
-        up(&lo->lo_bh_mutex);
+        atomic_inc(&lo->lo_pending);
+        if (waitqueue_active(&lo->lo_bh_wait))
+                wake_up(&lo->lo_bh_wait);
  }
  
  /*
   * Grab first pending buffer
   */
-static struct bio *loop_get_bio(struct lloop_device *lo)
+static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
  {
-        struct bio *bio;
+        struct bio *first;
+        struct bio **bio;
+        unsigned int count = 0;
+        unsigned int page_count = 0;
+        int rw;
  
          spin_lock_irq(&lo->lo_lock);
-        if ((bio = lo->lo_bio)) {
-                if (bio == lo->lo_biotail)
-                        lo->lo_biotail = NULL;
-                lo->lo_bio = bio->bi_next;
-                bio->bi_next = NULL;
+        first = lo->lo_bio;
+        if (unlikely(first == NULL)) {
+                spin_unlock_irq(&lo->lo_lock);
+                return 0;
          }
-        spin_unlock_irq(&lo->lo_lock);
  
-        return bio;
+        /* TODO: need to split the bio, too bad. */
+        LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);
+
+        rw = first->bi_rw;
+        bio = &lo->lo_bio;
+        while (*bio && (*bio)->bi_rw == rw) {
+                CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
+                       (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+                       page_count, (*bio)->bi_vcnt);
+                if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
+                        break;
+
+
+                page_count += (*bio)->bi_vcnt;
+                count++;
+                bio = &(*bio)->bi_next;
+        }
+        if (*bio) {
+                /* Some of bios can't be mergable. */
+                lo->lo_bio = *bio;
+                *bio = NULL;
+        } else {
+                /* Hit the end of queue */
+                lo->lo_biotail = NULL;
+                lo->lo_bio = NULL;
+        }
+        *req = first;
+        spin_unlock_irq(&lo->lo_lock);
+        return count;
  }
  
  static int loop_make_request(request_queue_t *q, struct bio *old_bio)
  {
          struct lloop_device *lo = q->queuedata;
          int rw = bio_rw(old_bio);
+        int inactive;
  
          if (!lo)
-                goto out;
+                goto err;
+
+        CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
+               (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
  
          spin_lock_irq(&lo->lo_lock);
-        if (lo->lo_state != LLOOP_BOUND)
-                goto inactive;
-        atomic_inc(&lo->lo_pending);
+        inactive = (lo->lo_state != LLOOP_BOUND);
          spin_unlock_irq(&lo->lo_lock);
+        if (inactive)
+                goto err;
  
          if (rw == WRITE) {
                  if (lo->lo_flags & LO_FLAGS_READ_ONLY)
@@ -296,14 +347,8 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
          loop_add_bio(lo, old_bio);
          return 0;
  err:
-        if (atomic_dec_and_test(&lo->lo_pending))
-                up(&lo->lo_bh_mutex);
-out:
          bio_io_error(old_bio, old_bio->bi_size);
          return 0;
-inactive:
-        spin_unlock_irq(&lo->lo_lock);
-        goto out;
  }
  
  /*
@@ -320,27 +365,37 @@ static void loop_unplug(request_queue_t *q)
  static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
  {
          int ret;
-        ret = do_bio_filebacked(lo, bio);
-        bio_endio(bio, bio->bi_size, ret);
+        ret = do_bio_lustrebacked(lo, bio);
+        while (bio) {
+                struct bio *tmp = bio->bi_next;
+                bio->bi_next = NULL;
+                bio_endio(bio, bio->bi_size, ret);
+                bio = tmp;
+        }
+}
+
+static inline int loop_active(struct lloop_device *lo)
+{
+        return atomic_read(&lo->lo_pending) || (lo->lo_state == LLOOP_RUNDOWN);
  }
  
  /*
   * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
+ * to avoid blocking in our make_request_fn.
   */
  static int loop_thread(void *data)
  {
          struct lloop_device *lo = data;
          struct bio *bio;
+        unsigned int count;
+        unsigned long times = 0;
+        unsigned long total_count = 0;
  
          daemonize("lloop%d", lo->lo_number);
  
          set_user_nice(current, -20);
  
          lo->lo_state = LLOOP_BOUND;
-        atomic_inc(&lo->lo_pending);
  
          /*
           * up sem, we are running
@@ -348,27 +403,39 @@ static int loop_thread(void *data)
          up(&lo->lo_sem);
  
          for (;;) {
-                down_interruptible(&lo->lo_bh_mutex);
-                /*
-                 * could be upped because of tear-down, not because of
-                 * pending work
-                 */
-                if (!atomic_read(&lo->lo_pending))
-                        break;
+                wait_event(lo->lo_bh_wait, loop_active(lo));
+                if (!atomic_read(&lo->lo_pending)) {
+                        int exiting = 0;
+                        spin_lock_irq(&lo->lo_lock);
+                        exiting = (lo->lo_state == LLOOP_RUNDOWN);
+                        spin_unlock_irq(&lo->lo_lock);
+                        if (exiting)
+                                break;
+                }
  
-                bio = loop_get_bio(lo);
-                if (!bio) {
+                bio = NULL;
+                count = loop_get_bio(lo, &bio);
+                if (!count) {
                          CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
                          continue;
                  }
-                loop_handle_bio(lo, bio);
  
-                /*
-                 * upped both for pending work and tear-down, lo_pending
-                 * will hit zero then
-                 */
-                if (atomic_dec_and_test(&lo->lo_pending))
-                        break;
+                total_count += count;
+                if (total_count < count) {      /* overflow */
+                        total_count = count;
+                        times = 1;
+                } else {
+                        times++;
+                }
+                if ((times & 127) == 0) {
+                        CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
+                               total_count, times, total_count / times);
+                }
+
+                LASSERT(bio != NULL);
+                LASSERT(count <= atomic_read(&lo->lo_pending));
+                loop_handle_bio(lo, bio);
+                atomic_sub(count, &lo->lo_pending);
          }
  
          up(&lo->lo_sem);
@@ -378,10 +445,10 @@ static int loop_thread(void *data)
  static int loop_set_fd(struct lloop_device *lo, struct file *unused,
                         struct block_device *bdev, struct file *file)
  {
-        struct inode        *inode;
+        struct inode         *inode;
          struct address_space *mapping;
-        int                lo_flags = 0;
-        int                error;
+        int                   lo_flags = 0;
+        int                   error;
          loff_t                size;
  
          if (!try_module_get(THIS_MODULE))
@@ -434,8 +501,10 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused,
  
          /* queue parameters */
          blk_queue_hardsect_size(lo->lo_queue, CFS_PAGE_SIZE);
-        blk_queue_max_sectors(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+        blk_queue_max_sectors(lo->lo_queue,
+                              LLOOP_MAX_SEGMENTS << (CFS_PAGE_SHIFT - 9));
          blk_queue_max_phys_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+        blk_queue_max_hw_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
  
          set_capacity(disks[lo->lo_number], size);
          bd_set_size(bdev, size << 9);
@@ -452,7 +521,7 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused,
          return error;
  }
  
-static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, 
+static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
                         int count)
  {
          struct file *filp = lo->lo_backing_file;
@@ -469,9 +538,8 @@ static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
  
          spin_lock_irq(&lo->lo_lock);
          lo->lo_state = LLOOP_RUNDOWN;
-        if (atomic_dec_and_test(&lo->lo_pending))
-                up(&lo->lo_bh_mutex);
          spin_unlock_irq(&lo->lo_lock);
+        wake_up(&lo->lo_bh_wait);
  
          down(&lo->lo_sem);
          lo->lo_backing_file = NULL;
@@ -514,8 +582,8 @@ static int lo_release(struct inode *inode, struct file *file)
  }
  
  /* lloop device node's ioctl function. */
-static int lo_ioctl(struct inode *inode, struct file *unused, 
-        unsigned int cmd, unsigned long arg)
+static int lo_ioctl(struct inode *inode, struct file *unused,
+                    unsigned int cmd, unsigned long arg)
  {
          struct lloop_device *lo = inode->i_bdev->bd_disk->private_data;
          struct block_device *bdev = inode->i_bdev;
@@ -538,7 +606,7 @@ static int lo_ioctl(struct inode *inode, struct file *unused,
  
                  if (put_user(ino, (__u64 *)arg))
                          err = -EFAULT;
-                break; 
+                break;
          }
  
          default:
@@ -557,15 +625,16 @@ static struct block_device_operations lo_fops = {
          .ioctl =        lo_ioctl,
  };
  
-/* dynamic iocontrol callback. 
- * This callback is registered in lloop_init and will be called by 
- * ll_iocontrol_call. 
- * This is a llite regular file ioctl function. It takes the responsibility 
- * of attaching a file, and detaching a file by a lloop's device numner. 
+/* dynamic iocontrol callback.
+ * This callback is registered in lloop_init and will be called by
+ * ll_iocontrol_call.
+ *
+ * This is a llite regular file ioctl function. It takes the responsibility
+ * of attaching a file, and detaching a file by a lloop's device numner.
   */
-static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, 
-                unsigned int cmd, unsigned long arg,
-                void *magic, int *rcp)
+static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
+                                   unsigned int cmd, unsigned long arg,
+                                   void *magic, int *rcp)
  {
          struct lloop_device *lo = NULL;
          struct block_device *bdev = NULL;
@@ -591,7 +660,7 @@ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
                                          lo_free = lo;
                                  continue;
                          }
-                        if (lo->lo_backing_file->f_dentry->d_inode == 
+                        if (lo->lo_backing_file->f_dentry->d_inode ==
                              file->f_dentry->d_inode)
                                  break;
                  }
@@ -621,7 +690,7 @@ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
  
          case LL_IOC_LLOOP_DETACH_BYDEV: {
                  int minor;
-                
+
                  dev = old_decode_dev(arg);
                  if (MAJOR(dev) != lloop_major)
                          GOTO(out, err = -EINVAL);
@@ -664,25 +733,27 @@ static int __init lloop_init(void)
          };
  
          if (max_loop < 1 || max_loop > 256) {
+                max_loop = MAX_LOOP_DEFAULT;
                  CWARN("lloop: invalid max_loop (must be between"
-                      " 1 and 256), using default (8)\n");
-                max_loop = 8;
+                      " 1 and 256), using default (%u)\n", max_loop);
          }
  
          lloop_major = register_blkdev(0, "lloop");
          if (lloop_major < 0)
                  return -EIO;
  
+        CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
+               lloop_major, max_loop);
+
          ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
          if (ll_iocontrol_magic == NULL)
                  goto out_mem1;
  
-        loop_dev = kmalloc(max_loop * sizeof(struct lloop_device), GFP_KERNEL);
+        OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev));
          if (!loop_dev)
                  goto out_mem1;
-        memset(loop_dev, 0, max_loop * sizeof(struct lloop_device));
  
-        disks = kmalloc(max_loop * sizeof(struct gendisk *), GFP_KERNEL);
+        OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks));
          if (!disks)
                  goto out_mem2;
  
@@ -698,14 +769,13 @@ static int __init lloop_init(void)
                  struct lloop_device *lo = &loop_dev[i];
                  struct gendisk *disk = disks[i];
  
-                memset(lo, 0, sizeof(*lo));
                  lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
                  if (!lo->lo_queue)
                          goto out_mem4;
  
                  init_MUTEX(&lo->lo_ctl_mutex);
                  init_MUTEX_LOCKED(&lo->lo_sem);
-                init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+                init_waitqueue_head(&lo->lo_bh_wait);
                  lo->lo_number = i;
                  spin_lock_init(&lo->lo_lock);
                  disk->major = lloop_major;
@@ -728,9 +798,9 @@ out_mem4:
  out_mem3:
          while (i--)
                  put_disk(disks[i]);
-        kfree(disks);
+        OBD_FREE(disks, max_loop * sizeof(*disks));
  out_mem2:
-        kfree(loop_dev);
+        OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
  out_mem1:
          unregister_blkdev(lloop_major, "lloop");
          ll_iocontrol_unregister(ll_iocontrol_magic);
@@ -750,15 +820,17 @@ static void lloop_exit(void)
          }
          if (ll_unregister_blkdev(lloop_major, "lloop"))
                  CWARN("lloop: cannot unregister blkdev\n");
+        else
+                CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major);
  
-        kfree(disks);
-        kfree(loop_dev);
+        OBD_FREE(disks, max_loop * sizeof(*disks));
+        OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
  }
  
  module_init(lloop_init);
  module_exit(lloop_exit);
  
  CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device");
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre virtual block device");
  MODULE_LICENSE("GPL");
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c

index 9a735c8..6d3bb3d 100644 (file)
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_LLITE
  
@@ -34,7 +48,6 @@ struct proc_dir_entry *proc_lustre_fs_root;
  #ifdef LPROCFS
  /* /proc/lustre/llite mount point registration */
  struct file_operations llite_dump_pgcache_fops;
-struct file_operations ll_ra_stats_fops;
  struct file_operations ll_rw_extents_stats_fops;
  struct file_operations ll_rw_extents_stats_pp_fops;
  struct file_operations ll_rw_offset_stats_fops;
@@ -288,7 +301,8 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer,
  {
          struct super_block *sb = data;
          struct ll_sb_info *sbi = ll_s2sbi(sb);
-        int mult, rc, pages_number;
+        unsigned long budget;
+        int mult, rc, pages_number, cpu;
  
          mult = 1 << (20 - CFS_PAGE_SHIFT);
          rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
@@ -304,17 +318,51 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer,
          spin_lock(&sbi->ll_lock);
          sbi->ll_async_page_max = pages_number ;
          spin_unlock(&sbi->ll_lock);
-        
+
          if (!sbi->ll_osc_exp)
                  /* Not set up yet, don't call llap_shrink_cache */
                  return count;
  
-        if (sbi->ll_async_page_count >= sbi->ll_async_page_max)
-                llap_shrink_cache(sbi, 0);
+        spin_lock(&sbi->ll_async_page_reblnc_lock);
+        budget = sbi->ll_async_page_max / num_online_cpus();
+        for_each_online_cpu(cpu)
+                LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget = budget;
+        spin_unlock(&sbi->ll_async_page_reblnc_lock);
+
+        if (lcounter_read(&sbi->ll_async_page_count) >= sbi->ll_async_page_max)
+                llap_shrink_cache(sbi, -1);
  
          return count;
  }
  
+static int ll_rd_pgcache_balance(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        struct ll_pglist_data *pd;
+        unsigned long total_budget = 0;
+        int n = 0, cpu;
+
+        n += snprintf(page +n, count - n, "cpu\tpage count\tbudget"
+                      "\t\treblnc count\tgen\thit\tmiss\tcross\n");
+        for_each_online_cpu(cpu) {
+                pd = LL_PGLIST_DATA_CPU(sbi, cpu);
+                n += snprintf(page + n, count - n,
+                              "%d\t%-8lu\t%-8lu\t%-8lu\t%lu\t%lu\t%lu\t%lu\n",
+                              cpu, pd->llpd_count, pd->llpd_budget,
+                              pd->llpd_reblnc_count, pd->llpd_gen,
+                              pd->llpd_hit, pd->llpd_miss, pd->llpd_cross);
+                total_budget += pd->llpd_budget;
+        }
+        n += snprintf(page + n, count - n,
+                      "Total budget: %lu, page max: %lu, rebalance cnt: %lu\n",
+                      total_budget, sbi->ll_async_page_max,
+                      sbi->ll_async_page_reblnc_count);
+        *eof = 1;
+        return n;
+}
+
  static int ll_rd_checksum(char *page, char **start, off_t off,
                            int count, int *eof, void *data)
  {
@@ -373,7 +421,7 @@ static int ll_wr_max_rw_chunk(struct file *file, const char *buffer,
          return count;
  }
  
-static int ll_rd_track_id(char *page, int count, void *data, 
+static int ll_rd_track_id(char *page, int count, void *data,
                            enum stats_track_type type)
  {
          struct super_block *sb = data;
@@ -381,7 +429,7 @@ static int ll_rd_track_id(char *page, int count, void *data,
          if (ll_s2sbi(sb)->ll_stats_track_type == type) {
                  return snprintf(page, count, "%d\n",
                                  ll_s2sbi(sb)->ll_stats_track_id);
-        
+
          } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) {
                  return snprintf(page, count, "0 (all)\n");
          } else {
@@ -439,7 +487,7 @@ static int ll_rd_track_gid(char *page, char **start, off_t off,
  
  static int ll_wr_track_gid(struct file *file, const char *buffer,
                            unsigned long count, void *data)
-{                                                                 
+{
          return (ll_wr_track_id(buffer, count, data, STATS_TRACK_GID));
  }
  
@@ -550,6 +598,7 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
          { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
                                       ll_wr_max_read_ahead_whole_mb, 0 },
          { "max_cached_mb",  ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
+        { "pgcache_balance",ll_rd_pgcache_balance, 0, 0 },
          { "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 },
          { "max_rw_chunk",   ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 },
          { "stats_track_pid",  ll_rd_track_pid, ll_wr_track_pid, 0 },
@@ -602,11 +651,7 @@ struct llite_file_opcode {
          { LPROC_LL_TRUNC,          LPROCFS_TYPE_REGS, "truncate" },
          { LPROC_LL_LOCKLESS_TRUNC, LPROCFS_TYPE_REGS, "lockless_truncate" },
          { LPROC_LL_FLOCK,          LPROCFS_TYPE_REGS, "flock" },
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
          { LPROC_LL_GETATTR,        LPROCFS_TYPE_REGS, "getattr" },
-#else
-        { LPROC_LL_REVALIDATE,     LPROCFS_TYPE_REGS, "getattr" },
-#endif
          /* special inode operation */
          { LPROC_LL_STAFS,          LPROCFS_TYPE_REGS, "statfs" },
          { LPROC_LL_ALLOC_INODE,    LPROCFS_TYPE_REGS, "alloc_inode" },
@@ -654,6 +699,7 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
          char name[MAX_STRING_SIZE + 1], *ptr;
          int err, id, len;
          struct proc_dir_entry *entry;
+        static const char *ra_stats_string[] = LL_RA_STAT_STRINGS;
          ENTRY;
  
          memset(lvars, 0, sizeof(lvars));
@@ -669,12 +715,12 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
          len = strlen(lsi->lsi_lmd->lmd_profile);
          ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
          if (ptr && (strcmp(ptr, "-client") == 0))
-                len -= 7; 
-        
+                len -= 7;
+
          /* Mount info */
          snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len,
                   lsi->lsi_lmd->lmd_profile, sb);
-        
+
          sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL);
          if (IS_ERR(sbi->ll_proc_root)) {
                  err = PTR_ERR(sbi->ll_proc_root);
@@ -688,11 +734,13 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
          entry->proc_fops = &llite_dump_pgcache_fops;
          entry->data = sbi;
  
-        entry = create_proc_entry("read_ahead_stats", 0644, sbi->ll_proc_root);
-        if (entry == NULL)
-                GOTO(out, err = -ENOMEM);
-        entry->proc_fops = &ll_ra_stats_fops;
-        entry->data = sbi;
+        sbi->ll_ra_stats = lprocfs_alloc_stats(LL_RA_STAT,
+                                               LPROCFS_STATS_FLAG_PERCPU);
+        for (id = 0; id < LL_RA_STAT; id++)
+                lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
+                        ra_stats_string[id], "pages");
+        lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats",
+                sbi->ll_ra_stats);
  
          entry = create_proc_entry("extents_stats", 0644, sbi->ll_proc_root);
          if (entry == NULL)
@@ -714,7 +762,7 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
          entry->data = sbi;
  
          /* File operations stats */
-        sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, 
+        sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES,
                                              LPROCFS_STATS_FLAG_PERCPU);
          if (sbi->ll_stats == NULL)
                  GOTO(out, err = -ENOMEM);
@@ -790,6 +838,7 @@ void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi)
  {
          if (sbi->ll_proc_root) {
                  lprocfs_remove(&sbi->ll_proc_root);
+                lprocfs_free_stats(&sbi->ll_ra_stats);
                  lprocfs_free_stats(&sbi->ll_stats);
          }
  }
@@ -819,6 +868,8 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v)
  {
          struct ll_async_page *llap, *dummy_llap = seq->private;
          struct ll_sb_info *sbi = dummy_llap->llap_cookie;
+        struct ll_pglist_data *pd;
+        int cpu = dummy_llap->llap_pglist_cpu;
  
          /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement
           * it in our own state */
@@ -828,19 +879,23 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v)
                  return 0;
          }
  
-        spin_lock(&sbi->ll_lock);
-
-        llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item);
+        pd = ll_pglist_cpu_lock(sbi, cpu);
+        llap = llite_pglist_next_llap(&pd->llpd_list,
+                                      &dummy_llap->llap_pglist_item);
          if (llap != NULL)  {
-                int has_flags = 0;
+                int has_flags = 0, i;
                  struct page *page = llap->llap_page;
+                unsigned long gen = 0UL;
  
                  LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n",
                           llap->llap_origin);
  
+                for_each_online_cpu(i)
+                         gen += LL_PGLIST_DATA_CPU(sbi, i)->llpd_gen;
+
                  seq_printf(seq," %5lu | %p %p %s %s %s %s | %p %lu/%u(%p) "
                             "%lu %u [",
-                           sbi->ll_pglist_gen,
+                           gen,
                             llap, llap->llap_cookie,
                             llap_origins[llap->llap_origin],
                             llap->llap_write_queued ? "wq" : "- ",
@@ -864,17 +919,18 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v)
                  else
                          seq_puts(seq, "]\n");
          }
-
-        spin_unlock(&sbi->ll_lock);
+        ll_pglist_cpu_unlock(sbi, cpu);
  
          return 0;
  }
  
-static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, 
+static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v,
                                           loff_t *pos)
  {
          struct ll_async_page *llap, *dummy_llap = seq->private;
          struct ll_sb_info *sbi = dummy_llap->llap_cookie;
+        struct ll_pglist_data *pd, *next;
+        int cpu = dummy_llap->llap_pglist_cpu;
  
          /* bail if we just displayed the banner */
          if (dummy_llap->llap_magic == 0) {
@@ -885,14 +941,35 @@ static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v,
          /* we've just displayed the llap that is after us in the list.
           * we advance to a position beyond it, returning null if there
           * isn't another llap in the list beyond that new position. */
-        spin_lock(&sbi->ll_lock);
-        llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item);
+        pd = ll_pglist_cpu_lock(sbi, cpu);
+        llap = llite_pglist_next_llap(&pd->llpd_list,
+                        &dummy_llap->llap_pglist_item);
          list_del_init(&dummy_llap->llap_pglist_item);
          if (llap) {
                  list_add(&dummy_llap->llap_pglist_item,&llap->llap_pglist_item);
-                llap =llite_pglist_next_llap(sbi,&dummy_llap->llap_pglist_item);
+                llap = llite_pglist_next_llap(&pd->llpd_list,
+                                &dummy_llap->llap_pglist_item);
          }
-        spin_unlock(&sbi->ll_lock);
+        if (llap == NULL) {
+                int i = cpu + 1;
+                for (next = NULL; i < num_possible_cpus(); i++, next = NULL) {
+                        next = ll_pglist_cpu_lock(sbi, i);
+                        if (!list_empty(&next->llpd_list))
+                                break;
+                        ll_pglist_cpu_unlock(sbi, i);
+                }
+                if (next != NULL) {
+                        list_move(&dummy_llap->llap_pglist_item,
+                                  &next->llpd_list);
+                        dummy_llap->llap_pglist_cpu = i;
+                        ll_pglist_cpu_unlock(sbi, cpu);
+                        llap = llite_pglist_next_llap(&next->llpd_list,
+                                        &dummy_llap->llap_pglist_item);
+                        LASSERT(llap);
+                        cpu = i;
+                }
+        }
+        ll_pglist_cpu_unlock(sbi, cpu);
  
          ++*pos;
          if (llap == NULL) {
@@ -930,6 +1007,7 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file)
          struct ll_async_page *dummy_llap;
          struct seq_file *seq;
          struct ll_sb_info *sbi = dp->data;
+        struct ll_pglist_data *pd;
          int rc = -ENOMEM;
  
          LPROCFS_ENTRY_AND_CHECK(dp);
@@ -941,6 +1019,7 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file)
          dummy_llap->llap_page = NULL;
          dummy_llap->llap_cookie = sbi;
          dummy_llap->llap_magic = 0;
+        dummy_llap->llap_pglist_cpu = 0;
  
          rc = seq_open(file, &llite_dump_pgcache_seq_sops);
          if (rc) {
@@ -950,9 +1029,9 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file)
          seq = file->private_data;
          seq->private = dummy_llap;
  
-        spin_lock(&sbi->ll_lock);
-        list_add(&dummy_llap->llap_pglist_item, &sbi->ll_pglist);
-        spin_unlock(&sbi->ll_lock);
+        pd = ll_pglist_cpu_lock(sbi, 0);
+        list_add(&dummy_llap->llap_pglist_item, &pd->llpd_list);
+        ll_pglist_cpu_unlock(sbi, 0);
  
  out:
          if (rc)
@@ -966,11 +1045,12 @@ static int llite_dump_pgcache_seq_release(struct inode *inode,
          struct seq_file *seq = file->private_data;
          struct ll_async_page *dummy_llap = seq->private;
          struct ll_sb_info *sbi = dummy_llap->llap_cookie;
+        int cpu = dummy_llap->llap_pglist_cpu;
  
-        spin_lock(&sbi->ll_lock);
+        ll_pglist_cpu_lock(sbi, cpu);
          if (!list_empty(&dummy_llap->llap_pglist_item))
                  list_del_init(&dummy_llap->llap_pglist_item);
-        spin_unlock(&sbi->ll_lock);
+        ll_pglist_cpu_unlock(sbi, cpu);
          OBD_FREE(dummy_llap, sizeof(*dummy_llap));
  
          return lprocfs_seq_release(inode, file);
@@ -983,61 +1063,6 @@ struct file_operations llite_dump_pgcache_fops = {
          .release = llite_dump_pgcache_seq_release,
  };
  
-static int ll_ra_stats_seq_show(struct seq_file *seq, void *v)
-{
-        struct timeval now;
-        struct ll_sb_info *sbi = seq->private;
-        struct ll_ra_info *ra = &sbi->ll_ra_info;
-        int i;
-        static char *ra_stat_strings[] = {
-                [RA_STAT_HIT] = "hits",
-                [RA_STAT_MISS] = "misses",
-                [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive",
-                [RA_STAT_MISS_IN_WINDOW] = "miss inside window",
-                [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page",
-                [RA_STAT_FAILED_MATCH] = "failed lock match",
-                [RA_STAT_DISCARDED] = "read but discarded",
-                [RA_STAT_ZERO_LEN] = "zero length file",
-                [RA_STAT_ZERO_WINDOW] = "zero size window",
-                [RA_STAT_EOF] = "read-ahead to EOF",
-                [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
-                [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
-        };
-
-        do_gettimeofday(&now);
-
-        spin_lock(&sbi->ll_lock);
-
-        seq_printf(seq, "snapshot_time:         %lu.%lu (secs.usecs)\n",
-                   now.tv_sec, now.tv_usec);
-        seq_printf(seq, "pending issued pages:           %lu\n",
-                   ra->ra_cur_pages);
-
-        for(i = 0; i < _NR_RA_STAT; i++)
-                seq_printf(seq, "%-25s %lu\n", ra_stat_strings[i], 
-                           ra->ra_stats[i]);
-
-        spin_unlock(&sbi->ll_lock);
-
-        return 0;
-}
-
-static ssize_t ll_ra_stats_seq_write(struct file *file, const char *buf,
-                                       size_t len, loff_t *off)
-{
-        struct seq_file *seq = file->private_data;
-        struct ll_sb_info *sbi = seq->private;
-        struct ll_ra_info *ra = &sbi->ll_ra_info;
-
-        spin_lock(&sbi->ll_lock);
-        memset(ra->ra_stats, 0, sizeof(ra->ra_stats));
-        spin_unlock(&sbi->ll_lock);
-
-        return len;
-}
-
-LPROC_SEQ_FOPS(ll_ra_stats);
-
  #define pct(a,b) (b ? a * 100 / b : 0)
  
  static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
@@ -1090,14 +1115,15 @@ static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v)
          do_gettimeofday(&now);
  
          if (!sbi->ll_rw_stats_on) {
-                seq_printf(seq, "Disabled\n"
-                                "Write anything in this file to activate\n");
+                seq_printf(seq, "disabled\n"
+                                "write anything in this file to activate, "
+                                "then 0 or \"[D/d]isabled\" to deactivate\n");
                  return 0;
          }
          seq_printf(seq, "snapshot_time:         %lu.%lu (secs.usecs)\n",
                     now.tv_sec, now.tv_usec);
          seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
-        seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n", 
+        seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
                     "extents", "calls", "%", "cum%",
                     "calls", "%", "cum%");
          spin_lock(&sbi->ll_pp_extent_lock);
@@ -1120,8 +1146,18 @@ static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file,
          struct ll_sb_info *sbi = seq->private;
          struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
          int i;
+        int value = 1, rc = 0;
+
+        rc = lprocfs_write_helper(buf, len, &value);
+        if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                       strcmp(buf, "Disabled") == 0))
+                value = 0;
+
+        if (value == 0)
+                sbi->ll_rw_stats_on = 0;
+        else
+                sbi->ll_rw_stats_on = 1;
  
-        sbi->ll_rw_stats_on = 1;
          spin_lock(&sbi->ll_pp_extent_lock);
          for(i = 0; i < LL_PROCESS_HIST_MAX; i++) {
                  io_extents->pp_extents[i].pid = 0;
@@ -1143,15 +1179,16 @@ static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v)
          do_gettimeofday(&now);
  
          if (!sbi->ll_rw_stats_on) {
-                seq_printf(seq, "Disabled\n"
-                                "Write anything in this file to activate\n");
+                seq_printf(seq, "disabled\n"
+                                "write anything in this file to activate, "
+                                "then 0 or \"[D/d]isabled\" to deactivate\n");
                  return 0;
          }
          seq_printf(seq, "snapshot_time:         %lu.%lu (secs.usecs)\n",
                     now.tv_sec, now.tv_usec);
  
          seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
-        seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n", 
+        seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
                     "extents", "calls", "%", "cum%",
                     "calls", "%", "cum%");
          spin_lock(&sbi->ll_lock);
@@ -1168,8 +1205,17 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf,
          struct ll_sb_info *sbi = seq->private;
          struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
          int i;
+        int value = 1, rc = 0;
+
+        rc = lprocfs_write_helper(buf, len, &value);
+        if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                       strcmp(buf, "Disabled") == 0))
+                value = 0;
  
-        sbi->ll_rw_stats_on = 1;
+        if (value == 0)
+                sbi->ll_rw_stats_on = 0;
+        else
+                sbi->ll_rw_stats_on = 1;
          spin_lock(&sbi->ll_pp_extent_lock);
          for(i = 0; i <= LL_PROCESS_HIST_MAX; i++)
          {
@@ -1210,15 +1256,15 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
  
          if (cur == -1) {
                  /* new process */
-                sbi->ll_extent_process_count = 
-                        (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX;
+                sbi->ll_extent_process_count =
+                        (sbi->ll_extent_process_count+1) % LL_PROCESS_HIST_MAX;
                  cur = sbi->ll_extent_process_count;
                  io_extents->pp_extents[cur].pid = pid;
                  lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist);
                  lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist);
          }
  
-        for(i = 0; (count >= (1 << LL_HIST_START << i)) && 
+        for(i = 0; (count >= (1 << LL_HIST_START << i)) &&
               (i < (LL_HIST_MAX - 1)); i++);
          if (rw == 0) {
                  io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++;
@@ -1301,8 +1347,9 @@ static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v)
          do_gettimeofday(&now);
  
          if (!sbi->ll_rw_stats_on) {
-                seq_printf(seq, "Disabled\n"
-                                "Write anything in this file to activate\n");
+                seq_printf(seq, "disabled\n"
+                                "write anything in this file to activate, "
+                                "then 0 or \"[D/d]isabled\" to deactivate\n");
                  return 0;
          }
          spin_lock(&sbi->ll_process_lock);
@@ -1351,8 +1398,18 @@ static ssize_t ll_rw_offset_stats_seq_write(struct file *file, const char *buf,
          struct ll_sb_info *sbi = seq->private;
          struct ll_rw_process_info *process_info = sbi->ll_rw_process_info;
          struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info;
+        int value = 1, rc = 0;
+
+        rc = lprocfs_write_helper(buf, len, &value);
  
-        sbi->ll_rw_stats_on = 1;
+        if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                           strcmp(buf, "Disabled") == 0))
+                value = 0;
+
+        if (value == 0)
+                sbi->ll_rw_stats_on = 0;
+        else
+                sbi->ll_rw_stats_on = 1;
  
          spin_lock(&sbi->ll_process_lock);
          sbi->ll_offset_process_count = 0;
diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c

index b6a344d..0a16d65 100644 (file)
--- a/lustre/llite/namei.c
+++ b/lustre/llite/namei.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <linux/fs.h>
@@ -37,121 +52,103 @@
  
  /* methods */
  
-/* called from iget{4,5_locked}->find_inode() under inode_lock spinlock */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-static int ll_test_inode(struct inode *inode, unsigned long ino, void *opaque)
-#else
-static int ll_test_inode(struct inode *inode, void *opaque)
-#endif
+int ll_unlock(__u32 mode, struct lustre_handle *lockh)
  {
-        static int last_ino, last_gen, last_count;
-        struct lustre_md *md = opaque;
+        ENTRY;
  
-        if (!(md->body->valid & (OBD_MD_FLGENER | OBD_MD_FLID))) {
-                CERROR("MDS body missing inum or generation\n");
-                return 0;
-        }
+        ldlm_lock_decref(lockh, mode);
  
-        if (last_ino == md->body->ino && last_gen == md->body->generation &&
-            last_count < 500) {
-                last_count++;
-        } else {
-                if (last_count > 1)
-                        CDEBUG(D_VFSTRACE, "compared %u/%u %u times\n",
-                               last_ino, last_gen, last_count);
-                last_count = 0;
-                last_ino = md->body->ino;
-                last_gen = md->body->generation;
-                CDEBUG(D_VFSTRACE,
-                       "comparing inode %p ino %lu/%u to body "LPU64"/%u\n",
-                       inode, inode->i_ino, inode->i_generation,
-                       md->body->ino, md->body->generation);
-        }
+        RETURN(0);
+}
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        if (inode->i_ino != md->body->ino)
-                return 0;
-#endif
-        if (inode->i_generation != md->body->generation) {
-#ifdef HAVE_EXPORT___IGET
-                if (inode->i_state & (I_FREEING | I_CLEAR))
-                        return 0;
-                if (inode->i_nlink == 0)
-                        return 0;
-
-                /* add "duplicate" inode into deathrow for destroy */
-                spin_lock(&ll_i2sbi(inode)->ll_deathrow_lock);
-                if (list_empty(&ll_i2info(inode)->lli_dead_list)) {
-                        __iget(inode);
-                        list_add(&ll_i2info(inode)->lli_dead_list,
-                                 &ll_i2sbi(inode)->ll_deathrow);
-                }
-                spin_unlock(&ll_i2sbi(inode)->ll_deathrow_lock);
-#endif
+/* Get an inode by inode number (already instantiated by the intent lookup).
+ * Returns inode or NULL
+ */
  
-                return 0;
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{                      
+        return (fid_seq(fid) - 1) * LUSTRE_SEQ_MAX_WIDTH + fid_oid(fid);
+}
+/* Build inode number on passed @fid */
+ino_t ll_fid_build_ino(struct ll_sb_info *sbi,
+                       struct ll_fid *fid)
+{
+        ino_t ino;
+        ENTRY;
+
+        if (fid_is_igif((struct lu_fid*)fid)) {
+                ino = lu_igif_ino((struct lu_fid*)fid);
+                RETURN(ino);
          }
  
-        /* Apply the attributes in 'opaque' to this inode */
-        if (!(inode->i_state & (I_FREEING | I_CLEAR)))
-                ll_update_inode(inode, md);
-        return 1;
+        /*
+         * Very stupid and having many downsides inode allocation algorithm
+         * based on fid.
+         */
+        ino = fid_flatten((struct lu_fid*)fid) & 0xFFFFFFFF;
+
+        if (unlikely(ino == 0))
+                /* the first result ino is 0xFFC001, so this is rarely used */
+                ino = 0xffbcde;
+        ino = ino | 0x80000000;
+        RETURN(ino);
+
  }
  
-int ll_unlock(__u32 mode, struct lustre_handle *lockh)
+/* called from iget5_locked->find_inode() under inode_lock spinlock */
+static int fid_test_inode(struct inode *inode, void *opaque)
  {
-        ENTRY;
+        struct lustre_md     *md = opaque;
  
-        ldlm_lock_decref(lockh, mode);
+        if (unlikely(!(md->body->valid & OBD_MD_FLID))) {
+                CERROR("MDS body missing FID\n");
+                return 0;
+        }
  
-        RETURN(0);
+        return lu_fid_eq(ll_inode_lu_fid(inode),
+                         (struct lu_fid*)&md->body->fid1);
  }
  
-/* Get an inode by inode number (already instantiated by the intent lookup).
- * Returns inode or NULL
- */
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-int ll_set_inode(struct inode *inode, void *opaque)
+static int fid_set_inode(struct inode *inode, void *opaque)
  {
-        ll_read_inode2(inode, opaque);
+        struct lustre_md     *md  = opaque;
+
+        *ll_inode_lu_fid(inode) = *((struct lu_fid*)&md->body->fid1);
          return 0;
  }
  
  struct inode *ll_iget(struct super_block *sb, ino_t hash,
-                      struct lustre_md *md)
+                          struct lustre_md *md)
  {
-        struct inode *inode;
+        struct ll_inode_info *lli;
+        struct inode         *inode;
+        ENTRY;
  
          LASSERT(hash != 0);
-        inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md);
+        inode = iget5_locked(sb, hash, fid_test_inode, fid_set_inode, md);
  
          if (inode) {
-                if (inode->i_state & I_NEW)
+                lli = ll_i2info(inode);
+                if (inode->i_state & I_NEW) {
+                        ll_read_inode2(inode, md);
                          unlock_new_inode(inode);
-                CDEBUG(D_VFSTRACE, "inode: %lu/%u(%p)\n", inode->i_ino,
-                       inode->i_generation, inode);
+                } else {
+                        if (!(inode->i_state & (I_FREEING | I_CLEAR)))
+                                ll_update_inode(inode, md);
+                }
+                CDEBUG(D_VFSTRACE, "got inode: %lu/%u(%p) for "DFID"\n",
+                       inode->i_ino, inode->i_generation, inode,
+                       PFID(ll_inode_lu_fid(inode)));
          }
  
-        return inode;
-}
-#else
-struct inode *ll_iget(struct super_block *sb, ino_t hash,
-                      struct lustre_md *md)
-{
-        struct inode *inode;
-        LASSERT(hash != 0);
-        inode = iget4(sb, hash, ll_test_inode, md);
-        if (inode)
-                CDEBUG(D_VFSTRACE, "inode: %lu/%u(%p)\n", inode->i_ino,
-                       inode->i_generation, inode);
-        return inode;
+        RETURN(inode);
  }
-#endif
  
  static void ll_drop_negative_dentry(struct inode *dir)
  { 
          struct dentry *dentry, *tmp_alias, *tmp_subdir;
  
+        spin_lock(&ll_lookup_lock);
          spin_lock(&dcache_lock);
  restart:
          list_for_each_entry_safe(dentry, tmp_alias,
@@ -172,6 +169,7 @@ restart:
                  }
          }
          spin_unlock(&dcache_lock);
+        spin_unlock(&ll_lookup_lock);
  }
  
  int ll_mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
@@ -193,11 +191,14 @@ int ll_mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
          case LDLM_CB_CANCELING: {
                  struct inode *inode = ll_inode_from_lock(lock);
                  __u64 bits = lock->l_policy_data.l_inodebits.bits;
+                struct lu_fid *fid;
  
                  /* Invalidate all dentries associated with this inode */
                  if (inode == NULL)
                          break;
  
+                fid = ll_inode_lu_fid(inode);;
+
                  LASSERT(lock->l_flags & LDLM_FL_CANCELING);
                  if ((bits & MDS_INODELOCK_LOOKUP) &&
                      ll_have_md_lock(inode, MDS_INODELOCK_LOOKUP))
@@ -208,9 +209,8 @@ int ll_mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                  if ((bits & MDS_INODELOCK_OPEN) &&
                      ll_have_md_lock(inode, MDS_INODELOCK_OPEN))
                          bits &= ~MDS_INODELOCK_OPEN;
-                
-                if (lock->l_resource->lr_name.name[0] != inode->i_ino ||
-                    lock->l_resource->lr_name.name[1] != inode->i_generation) {
+
+                if (!fid_res_name_eq(fid, &lock->l_resource->lr_name)) {
                          LDLM_ERROR(lock, "data mismatch with ino %lu/%u (%p)",
                                     inode->i_ino, inode->i_generation, inode);
                  }
@@ -267,11 +267,11 @@ int ll_mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
  int ll_mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode,
                           int flags, void *opaque)
  {
-        struct ldlm_res_id res_id =
-                { .name = {inode->i_ino, inode->i_generation} };
+        struct ldlm_res_id res_id;
          struct obd_device *obddev = class_conn2obd(conn);
          ENTRY;
  
+        fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id);
          RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags,
                                        opaque));
  }
@@ -378,6 +378,7 @@ static struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
          struct dentry *dentry;
          struct dentry *last_discon = NULL;
  
+        spin_lock(&ll_lookup_lock);
          spin_lock(&dcache_lock);
          list_for_each(tmp, &inode->i_dentry) {
                  dentry = list_entry(tmp, struct dentry, d_alias);
@@ -414,29 +415,35 @@ static struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
                  dentry->d_flags &= ~DCACHE_LUSTRE_INVALID;
  #endif
                  unlock_dentry(dentry);
+                ll_dops_init(dentry, 0);
                  d_rehash_cond(dentry, 0); /* avoid taking dcache_lock inside */
                  spin_unlock(&dcache_lock);
+                spin_unlock(&ll_lookup_lock);
                  iput(inode);
                  CDEBUG(D_DENTRY, "alias dentry %.*s (%p) parent %p inode %p "
                         "refc %d\n", de->d_name.len, de->d_name.name, de,
                         de->d_parent, de->d_inode, atomic_read(&de->d_count));
                  return dentry;
          }
+
          if (last_discon) {
-                 CDEBUG(D_DENTRY, "Reuse disconnected dentry %p inode %p "
+                CDEBUG(D_DENTRY, "Reuse disconnected dentry %p inode %p "
                          "refc %d\n", last_discon, last_discon->d_inode,
                          atomic_read(&last_discon->d_count));
-                 dget_locked(last_discon);
-                 spin_unlock(&dcache_lock);
-                 d_rehash(de);
-                 d_move(last_discon, de);
-                 iput(inode);
-                 return last_discon;
+                dget_locked(last_discon);
+                spin_unlock(&dcache_lock);
+                spin_unlock(&ll_lookup_lock);
+                ll_dops_init(last_discon, 1);
+                d_rehash(de);
+                d_move(last_discon, de);
+                iput(inode);
+                return last_discon;
          }
  
          ll_d_add(de, inode);
  
          spin_unlock(&dcache_lock);
+        spin_unlock(&ll_lookup_lock);
  
          return de;
  }
@@ -449,16 +456,9 @@ int lookup_it_finish(struct ptlrpc_request *request, int offset,
          struct inode *parent = icbd->icbd_parent;
          struct ll_sb_info *sbi = ll_i2sbi(parent);
          struct inode *inode = NULL;
-        int set = 0, rc;
+        int rc;
          ENTRY;
  
-        lock_dentry(*de);
-        if (likely((*de)->d_op != &ll_d_ops)) {
-                (*de)->d_op = &ll_init_d_ops;
-                set = 1;
-        }
-        unlock_dentry(*de);
-
          /* NB 1 request reference will be taken away by ll_intent_lock()
           * when I return */
          if (!it_disposition(it, DISP_LOOKUP_NEG)) {
@@ -466,20 +466,8 @@ int lookup_it_finish(struct ptlrpc_request *request, int offset,
  
                  rc = ll_prep_inode(sbi->ll_osc_exp, &inode, request, offset,
                                     (*de)->d_sb);
-                if (rc) {
-                        if (set) {
-                                lock_dentry(*de);
-                                if (likely((*de)->d_op == &ll_init_d_ops)) {
-                                        (*de)->d_op = &ll_fini_d_ops;
-                                        unlock_dentry(*de);
-                                        smp_wmb();
-                                        ll_d_wakeup(*de);
-                                } else {
-                                        unlock_dentry(*de);
-                                }
-                        }
+                if (rc)
                          RETURN(rc);
-                }
  
                  CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
                         inode, inode->i_ino, inode->i_generation);
@@ -493,19 +481,22 @@ int lookup_it_finish(struct ptlrpc_request *request, int offset,
                     Everybody else who needs correct file size would call
                     ll_glimpse_size or some equivalent themselves anyway.
                     Also see bug 7198. */
+
+                ll_dops_init(*de, 1);
                  *de = ll_find_alias(inode, *de);
-                if (set && *de != save) {
-                        lock_dentry(save);
-                        if (likely(save->d_op == &ll_init_d_ops)) {
-                                save->d_op = &ll_fini_d_ops;
-                                unlock_dentry(save);
-                                smp_wmb();
-                                ll_d_wakeup(save);
-                        } else {
-                                unlock_dentry(save);
+                if (*de != save) {
+                        struct ll_dentry_data *lld = ll_d2d(*de);
+
+                        /* just make sure the ll_dentry_data is ready */
+                        if (unlikely(lld == NULL)) {
+                                ll_set_dd(*de);
+                                lld = ll_d2d(*de);
+                                if (likely(lld != NULL))
+                                        lld->lld_sa_generation = 0;
                          }
                  }
          } else {
+                ll_dops_init(*de, 1);
                  /* Check that parent has UPDATE lock. If there is none, we
                     cannot afford to hash this dentry (done by ll_d_add) as it
                     might get picked up later when UPDATE lock will appear */
@@ -514,6 +505,7 @@ int lookup_it_finish(struct ptlrpc_request *request, int offset,
                          ll_d_add(*de, inode);
                          spin_unlock(&dcache_lock);
                  } else {
+                        (*de)->d_inode = NULL;
                          /* We do not want to hash the dentry if don`t have a
                           * lock, but if this dentry is later used in d_move,
                           * we'd hit uninitialised list head d_hash, so we just
@@ -524,19 +516,6 @@ int lookup_it_finish(struct ptlrpc_request *request, int offset,
                  }
          }
  
-        ll_set_dd(*de);
-
-        lock_dentry(*de);
-        if (likely((*de)->d_op == &ll_init_d_ops)) {
-                (*de)->d_op = &ll_d_ops;
-                unlock_dentry(*de);
-                smp_wmb();
-                ll_d_wakeup(*de);
-        } else {
-                (*de)->d_op = &ll_d_ops;
-                unlock_dentry(*de);
-        }
-
          RETURN(0);
  }
  
@@ -544,11 +523,11 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
                                     struct lookup_intent *it, int lookup_flags)
  {
          struct dentry *save = dentry, *retval;
-        struct mdc_op_data op_data;
+        struct mdc_op_data op_data = { { 0 } };
          struct it_cb_data icbd;
          struct ptlrpc_request *req = NULL;
          struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
-        int rc;
+        int rc, first = 0;
          ENTRY;
  
          CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
@@ -570,10 +549,10 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
          }
  
          if (it->it_op == IT_GETATTR) {
-                rc = ll_statahead_enter(parent, &dentry, 1);
-                if (rc >= 0) {
-                        ll_statahead_exit(dentry, rc);
-                        if (rc == 1)
+                first = ll_statahead_enter(parent, &dentry, 1);
+                if (first >= 0) {
+                        ll_statahead_exit(dentry, first);
+                        if (first == 1)
                                  RETURN(retval = dentry);
                  }
          }
@@ -600,6 +579,9 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
                  GOTO(out, retval = ERR_PTR(rc));
          }
  
+        if (first == -EEXIST)
+                ll_statahead_mark(dentry);
+
          if ((it->it_op & IT_OPEN) && dentry->d_inode &&
              !S_ISREG(dentry->d_inode->i_mode) &&
              !S_ISDIR(dentry->d_inode->i_mode)) {
@@ -617,7 +599,6 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
          return retval;
  }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  #ifdef HAVE_VFS_INTENT_PATCHES
  static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
                                     struct nameidata *nd)
@@ -746,7 +727,6 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
          RETURN(de);
  }
  #endif
-#endif
  
  /* We depend on "mode" being set with the proper file type/umask by now */
  static struct inode *ll_create_node(struct inode *dir, const char *name,
@@ -840,21 +820,15 @@ static void ll_update_times(struct ptlrpc_request *request, int offset,
                                                 sizeof(*body));
          LASSERT(body);
  
-        /* mtime is always updated with ctime, but can be set in past.
-           As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, so take mtime from mds
-           for the same ctimes. */
+        if (body->valid & OBD_MD_FLMTIME &&
+            body->mtime > LTIME_S(inode->i_mtime)) {
+                CDEBUG(D_INODE, "setting ino %lu mtime from %lu to "LPU64"\n",
+                       inode->i_ino, LTIME_S(inode->i_mtime), body->mtime);
+                LTIME_S(inode->i_mtime) = body->mtime;
+        }
          if (body->valid & OBD_MD_FLCTIME &&
-            body->ctime >= LTIME_S(inode->i_ctime)) {
+            body->ctime > LTIME_S(inode->i_ctime))
                  LTIME_S(inode->i_ctime) = body->ctime;
-
-                if (body->valid & OBD_MD_FLMTIME) {
-                        CDEBUG(D_INODE, "setting ino %lu mtime from %lu "
-                               "to "LPU64"\n", inode->i_ino,
-                               LTIME_S(inode->i_mtime), body->mtime);
-                        LTIME_S(inode->i_mtime) = body->mtime;
-                }
-        }
  }
  
  static int ll_new_node(struct inode *dir, struct qstr *name,
@@ -864,7 +838,7 @@ static int ll_new_node(struct inode *dir, struct qstr *name,
          struct ptlrpc_request *request = NULL;
          struct inode *inode = NULL;
          struct ll_sb_info *sbi = ll_i2sbi(dir);
-        struct mdc_op_data op_data;
+        struct mdc_op_data op_data = { { 0 } };
          int tgt_len = 0;
          int err;
  
@@ -879,7 +853,7 @@ static int ll_new_node(struct inode *dir, struct qstr *name,
  
          err = mdc_create(sbi->ll_mdc_exp, &op_data, tgt, tgt_len,
                           mode, current->fsuid, current->fsgid,
-                         current->cap_effective, rdev, &request);
+                         cfs_curproc_cap_pack(), rdev, &request);
          if (err)
                  GOTO(err_exit, err);
  
@@ -933,7 +907,6 @@ static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode,
          RETURN(err);
  }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  #ifndef HAVE_VFS_INTENT_PATCHES
  static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
  {
@@ -978,7 +951,6 @@ static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, stru
          return ll_create_it(dir, dentry, mode, &nd->intent);
  }
  #endif
-#endif
  
  static int ll_symlink_generic(struct inode *dir, struct qstr *name,
                                const char *tgt, struct dentry *dchild)
@@ -999,7 +971,7 @@ static int ll_link_generic(struct inode *src,  struct inode *dir,
                             struct qstr *name, struct dentry *dchild)
  {
          struct ptlrpc_request *request = NULL;
-        struct mdc_op_data op_data;
+        struct mdc_op_data op_data = { { 0 } };
          int err;
          struct ll_sb_info *sbi = ll_i2sbi(dir);
  
@@ -1064,7 +1036,7 @@ static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
                              struct qstr *name)
  {
          struct ptlrpc_request *request = NULL;
-        struct mdc_op_data op_data = {{0}};
+        struct mdc_op_data op_data = { { 0 } };
          struct dentry *dentry;
          int rc;
          ENTRY;
@@ -1149,8 +1121,9 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
                  GOTO(out_free_memmd, rc = -ENOMEM);
  
          oa->o_id = lsm->lsm_object_id;
+        oa->o_gr = lsm->lsm_object_gr;
          oa->o_mode = body->mode & S_IFMT;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
+        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLTYPE;
  
          if (body->valid & OBD_MD_FLCOOKIE) {
                  oa->o_valid |= OBD_MD_FLCOOKIE;
@@ -1167,8 +1140,8 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
          rc = obd_destroy(ll_i2obdexp(dir), oa, lsm, &oti, ll_i2mdcexp(dir));
          OBDO_FREE(oa);
          if (rc)
-                CERROR("obd destroy objid "LPX64" error %d\n",
-                       lsm->lsm_object_id, rc);
+                CERROR("obd destroy objid "LPX64"@"LPX64" error %d\n",
+                       lsm->lsm_object_id, lsm->lsm_object_gr, rc);
   out_free_memmd:
          obd_free_memmd(ll_i2obdexp(dir), &lsm);
   out:
@@ -1178,7 +1151,7 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
  static int ll_unlink_generic(struct inode * dir, struct qstr *name)
  {
          struct ptlrpc_request *request = NULL;
-        struct mdc_op_data op_data = {{0}};
+        struct mdc_op_data op_data = { { 0 } };
          int rc;
          ENTRY;
  
@@ -1211,7 +1184,7 @@ static int ll_rename_generic(struct inode *src, struct qstr *src_name,
  {
          struct ptlrpc_request *request = NULL;
          struct ll_sb_info *sbi = ll_i2sbi(src);
-        struct mdc_op_data op_data = {{0}};
+        struct mdc_op_data op_data = { { 0 } };
          int err;
  
          ENTRY;
@@ -1284,7 +1257,6 @@ static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode,
                                  old_encode_dev(rdev), dchild);
  }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  static int ll_unlink(struct inode * dir, struct dentry *dentry)
  {
          return ll_unlink_generic(dir, &dentry->d_name);
@@ -1322,7 +1294,6 @@ static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
          }
          return err;
  }
-#endif
  
  struct inode_operations ll_dir_inode_operations = {
  #ifdef HAVE_VFS_INTENT_PATCHES
@@ -1337,11 +1308,6 @@ struct inode_operations ll_dir_inode_operations = {
          .setattr_raw        = ll_setattr_raw,
  #endif
          .mknod              = ll_mknod,
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        .create_it          = ll_create_it,
-        .lookup_it          = ll_lookup_it,
-        .revalidate_it      = ll_inode_revalidate_it,
-#else
          .lookup             = ll_lookup_nd,
          .create             = ll_create_nd,
          /* We need all these non-raw things for NFSD, to not patch it. */
@@ -1353,7 +1319,6 @@ struct inode_operations ll_dir_inode_operations = {
          .rename             = ll_rename,
          .setattr            = ll_setattr,
          .getattr            = ll_getattr,
-#endif
          .permission         = ll_inode_permission,
          .setxattr           = ll_setxattr,
          .getxattr           = ll_getxattr,
@@ -1366,11 +1331,7 @@ struct inode_operations ll_special_inode_operations = {
          .setattr_raw    = ll_setattr_raw,
  #endif
          .setattr        = ll_setattr,
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
          .getattr        = ll_getattr,
-#else
-        .revalidate_it  = ll_inode_revalidate_it,
-#endif
          .permission     = ll_inode_permission,
          .setxattr       = ll_setxattr,
          .getxattr       = ll_getxattr,
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c

index e79fa25..a7952b2 100644 (file)
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -1,25 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Lite I/O page cache routines shared by different kernel revs
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/rw.c
+ *
+ * Lustre Lite I/O page cache routines shared by different kernel revs
   */
+
  #ifndef AUTOCONF_INCLUDED
  #include <linux/config.h>
  #endif
@@ -108,6 +126,7 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock)
          struct ll_inode_info *lli = ll_i2info(inode);
          struct obd_info oinfo = { { { 0 } } };
          struct obdo oa;
+        obd_valid valid;
          int rc;
  
          ENTRY;
@@ -119,7 +138,12 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock)
          oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
          oinfo.oi_oa = &oa;
          oa.o_id = lli->lli_smd->lsm_object_id;
-        oa.o_valid = OBD_MD_FLID;
+        oa.o_gr = lli->lli_smd->lsm_object_gr;
+        oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+        valid = OBD_MD_FLTYPE | OBD_MD_FLMODE |OBD_MD_FLFID |
+                OBD_MD_FLATIME | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGENER |
+                OBD_MD_FLBLOCKS;
          if (srvlock) {
                  /* set OBD_MD_FLFLAGS in o_valid, only if we 
                   * set OBD_FL_TRUNCLOCK, otherwise ost_punch
@@ -127,11 +151,42 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock)
                   * in ost_punch */
                  oa.o_flags = OBD_FL_TRUNCLOCK;
                  oa.o_valid |= OBD_MD_FLFLAGS;
+
+                /* lockless truncate
+                 *
+                 * 1. do not use inode's timestamps because concurrent
+                 * stat might fill the inode with out-of-date times,
+                 * send current instead
+                 *
+                 * 2.do no update lsm, as long as stat (via
+                 * ll_glimpse_size) will bring attributes from osts
+                 * anyway */
+                oa.o_mtime = oa.o_ctime = LTIME_S(CURRENT_TIME);
+                oa.o_valid |= OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+        } else {
+                /* truncate under locks
+                 *
+                 * 1. update inode's mtime and ctime as long as
+                 * concurrent stat (via ll_glimpse_size) might bring
+                 * out-of-date ones
+                 *
+                 * 2. update lsm so that next stat (via
+                 * ll_glimpse_size) could get correct values in lsm */
+                struct ost_lvb xtimes;
+
+                lov_stripe_lock(lli->lli_smd);
+                LTIME_S(inode->i_mtime) = LTIME_S(CURRENT_TIME);
+                LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
+                xtimes.lvb_mtime = LTIME_S(inode->i_mtime);
+                xtimes.lvb_ctime = LTIME_S(inode->i_ctime);
+                obd_update_lvb(ll_i2obdexp(inode), lli->lli_smd, &xtimes,
+                               OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+                lov_stripe_unlock(lli->lli_smd);
+
+                valid |= OBD_MD_FLMTIME | OBD_MD_FLCTIME;
          }
-        obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |OBD_MD_FLFID|
-                        OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                        OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGENER |
-                        OBD_MD_FLBLOCKS);
+        obdo_from_inode(&oa, inode, valid);
+
          rc = obd_punch_rqset(ll_i2obdexp(inode), &oinfo, NULL);
          if (rc) {
                  CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino);
@@ -254,7 +309,9 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from,
  
          oa.o_mode = inode->i_mode;
          oa.o_id = lsm->lsm_object_id;
-        oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
+        oa.o_gr = lsm->lsm_object_gr;
+        oa.o_valid = OBD_MD_FLID   | OBD_MD_FLMODE |
+                     OBD_MD_FLTYPE | OBD_MD_FLGROUP;
          obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
  
          oinfo.oi_oa = &oa;
@@ -414,7 +471,8 @@ void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa)
          lsm = ll_i2info(inode)->lli_smd;
  
          oa->o_id = lsm->lsm_object_id;
-        oa->o_valid = OBD_MD_FLID;
+        oa->o_gr = lsm->lsm_object_gr;
+        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
          valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
          if (cmd & OBD_BRW_WRITE) {
                  oa->o_valid |= OBD_MD_FLEPOCH;
@@ -470,50 +528,36 @@ struct ll_async_page *llap_cast_private(struct page *page)
          return llap;
  }
  
-/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction.
+/* Try to reap @target pages in the specific @cpu's async page list.
   *
   * There is an llap attached onto every page in lustre, linked off @sbi.
   * We add an llap to the list so we don't lose our place during list walking.
   * If llaps in the list are being moved they will only move to the end
   * of the LRU, and we aren't terribly interested in those pages here (we
- * start at the beginning of the list where the least-used llaps are.
- */
-int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
+ * start at the beginning of the list where the least-used llaps are. */
+static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, 
+        int cpu, int target)
  {
          struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a };
-        unsigned long total, want, count = 0;
-
-        total = sbi->ll_async_page_count;
-
-        /* There can be a large number of llaps (600k or more in a large
-         * memory machine) so the VM 1/6 shrink ratio is likely too much.
-         * Since we are freeing pages also, we don't necessarily want to
-         * shrink so much.  Limit to 40MB of pages + llaps per call. */
-        if (shrink_fraction == 0)
-                want = sbi->ll_async_page_count - sbi->ll_async_page_max + 32;
-        else
-                want = (total + shrink_fraction - 1) / shrink_fraction;
-
-        if (want > 40 << (20 - CFS_PAGE_SHIFT))
-                want = 40 << (20 - CFS_PAGE_SHIFT);
-
-        CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n",
-               want, total, shrink_fraction);
-
-        spin_lock(&sbi->ll_lock);
-        list_add(&dummy_llap.llap_pglist_item, &sbi->ll_pglist);
-
-        while (--total >= 0 && count < want) {
+        struct ll_pglist_data *pd;
+        struct list_head *head;
+        int count = 0;
+
+        pd = ll_pglist_cpu_lock(sbi, cpu);
+        head = &pd->llpd_list;
+        list_add(&dummy_llap.llap_pglist_item, head);
+        while (count < target) {
                  struct page *page;
                  int keep;
  
                  if (unlikely(need_resched())) {
-                        spin_unlock(&sbi->ll_lock);
+                        ll_pglist_cpu_unlock(sbi, cpu);
                          cond_resched();
-                        spin_lock(&sbi->ll_lock);
+                        ll_pglist_cpu_lock(sbi, cpu);
                  }
  
-                llap = llite_pglist_next_llap(sbi,&dummy_llap.llap_pglist_item);
+                llap = llite_pglist_next_llap(head, 
+                        &dummy_llap.llap_pglist_item);
                  list_del_init(&dummy_llap.llap_pglist_item);
                  if (llap == NULL)
                          break;
@@ -549,7 +593,7 @@ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
                  }
  
                  page_cache_get(page);
-                spin_unlock(&sbi->ll_lock);
+                ll_pglist_cpu_unlock(sbi, cpu);
  
                  if (page->mapping != NULL) {
                          ll_teardown_mmaps(page->mapping,
@@ -571,15 +615,146 @@ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
                  unlock_page(page);
                  page_cache_release(page);
  
-                spin_lock(&sbi->ll_lock);
+                ll_pglist_cpu_lock(sbi, cpu);
          }
          list_del(&dummy_llap.llap_pglist_item);
-        spin_unlock(&sbi->ll_lock);
+        ll_pglist_cpu_unlock(sbi, cpu);
+
+        CDEBUG(D_CACHE, "shrank %d, expected %d however. \n", count, target);
+        return count;
+}
+
+
+/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction.
+ *
+ * At first, this code calculates total pages wanted by @shrink_fraction, then
+ * it deduces how many pages should be reaped from each cpu in proportion as 
+ * their own # of page count(llpd_count).
+ */
+int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
+{
+        unsigned long total, want, percpu_want, count = 0;
+        int cpu, nr_cpus;
+
+        total = lcounter_read(&sbi->ll_async_page_count);
+        if (total == 0)
+                return 0;
+
+#ifdef HAVE_SHRINKER_CACHE
+        want = shrink_fraction;
+        if (want == 0)
+                return total;
+#else
+        /* There can be a large number of llaps (600k or more in a large
+         * memory machine) so the VM 1/6 shrink ratio is likely too much.
+         * Since we are freeing pages also, we don't necessarily want to
+         * shrink so much.  Limit to 40MB of pages + llaps per call. */
+        if (shrink_fraction <= 0)
+                want = total - sbi->ll_async_page_max + 32*num_online_cpus();
+        else
+                want = (total + shrink_fraction - 1) / shrink_fraction;
+#endif
+
+        if (want > 40 << (20 - CFS_PAGE_SHIFT))
+                want = 40 << (20 - CFS_PAGE_SHIFT);
+
+        CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n",
+               want, total, shrink_fraction);
+
+        nr_cpus = num_possible_cpus();
+        cpu = sbi->ll_async_page_clock_hand;
+        /* we at most do one round */
+        do {
+                int c;
+
+                cpu = (cpu + 1) % nr_cpus;
+                c = LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_count;
+                if (!cpu_online(cpu))
+                        percpu_want = c;
+                else
+                        percpu_want = want / ((total / (c + 1)) + 1);
+                if (percpu_want == 0)
+                        continue;
+
+                count += llap_shrink_cache_internal(sbi, cpu, percpu_want);
+                if (count >= want)
+                        sbi->ll_async_page_clock_hand = cpu;
+        } while (cpu != sbi->ll_async_page_clock_hand);
  
          CDEBUG(D_CACHE, "shrank %lu/%lu and left %lu unscanned\n",
                 count, want, total);
  
+#ifdef HAVE_SHRINKER_CACHE
+        return lcounter_read(&sbi->ll_async_page_count);
+#else
          return count;
+#endif
+}
+
+/* Rebalance the async page queue len for each cpu. We hope that the cpu
+ * which do much IO job has a relative longer queue len.
+ * This function should be called with preempt disabled.
+ */
+static inline int llap_async_cache_rebalance(struct ll_sb_info *sbi)
+{
+        unsigned long sample = 0, *cpu_sample, bias, slice;
+        struct ll_pglist_data *pd;
+        cpumask_t mask;
+        int cpu, surplus;
+        int w1 = 7, w2 = 3, base = (w1 + w2); /* weight value */
+        atomic_t *pcnt;
+
+        if (!spin_trylock(&sbi->ll_async_page_reblnc_lock)) {
+                /* someone else is doing the job */
+                return 1;
+        }
+
+        pcnt = &LL_PGLIST_DATA(sbi)->llpd_sample_count;
+        if (!atomic_read(pcnt)) {
+                /* rare case, somebody else has gotten this job done */
+                spin_unlock(&sbi->ll_async_page_reblnc_lock);
+                return 1;
+        }
+
+        sbi->ll_async_page_reblnc_count++;
+        cpu_sample = sbi->ll_async_page_sample;
+        memset(cpu_sample, 0, num_possible_cpus() * sizeof(unsigned long));
+        for_each_online_cpu(cpu) {
+                pcnt = &LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_sample_count;
+                cpu_sample[cpu] = atomic_read(pcnt);
+                atomic_set(pcnt, 0);
+                sample += cpu_sample[cpu];
+        }
+
+        cpus_clear(mask);
+        surplus = sbi->ll_async_page_max;
+        slice = surplus / sample + 1;
+        sample /= num_online_cpus();
+        bias = sample >> 4;
+        for_each_online_cpu(cpu) {
+                pd = LL_PGLIST_DATA_CPU(sbi, cpu);
+                if (labs((long int)sample - cpu_sample[cpu]) > bias) {
+                        unsigned long budget = pd->llpd_budget;
+                        /* weighted original queue length and expected queue
+                         * length to avoid thrashing. */
+                        pd->llpd_budget = (budget * w1) / base +
+                                        (slice * cpu_sample[cpu]) * w2 / base;
+                        cpu_set(cpu, mask);
+                }
+                surplus -= pd->llpd_budget;
+        }
+        surplus /= cpus_weight(mask) ?: 1;
+        for_each_cpu_mask(cpu, mask)
+                LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget += surplus;
+        spin_unlock(&sbi->ll_async_page_reblnc_lock);
+
+        /* TODO: do we really need to call llap_shrink_cache_internal 
+         * for every cpus with its page_count greater than budget?
+         * for_each_cpu_mask(cpu, mask) 
+         *      ll_shrink_cache_internal(...) 
+         */
+
+        return 0;
  }
  
  static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
@@ -590,7 +765,8 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
          struct obd_export *exp;
          struct inode *inode = page->mapping->host;
          struct ll_sb_info *sbi;
-        int rc;
+        struct ll_pglist_data *pd;
+        int rc, cpu, target;
          ENTRY;
  
          if (!inode) {
@@ -613,11 +789,30 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
                  /* move to end of LRU list, except when page is just about to
                   * die */
                  if (origin != LLAP_ORIGIN_REMOVEPAGE) {
-                        spin_lock(&sbi->ll_lock);
-                        sbi->ll_pglist_gen++;
-                        list_del_init(&llap->llap_pglist_item);
-                        list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist);
-                        spin_unlock(&sbi->ll_lock);
+                        int old_cpu = llap->llap_pglist_cpu;
+                        struct ll_pglist_data *old_pd;
+
+                        pd = ll_pglist_double_lock(sbi, old_cpu, &old_pd);
+                        pd->llpd_hit++;
+                        while (old_cpu != llap->llap_pglist_cpu) {
+                                /* rarely case, someone else is touching this
+                                 * page too. */
+                                ll_pglist_double_unlock(sbi, old_cpu);
+                                old_cpu = llap->llap_pglist_cpu;
+                                pd=ll_pglist_double_lock(sbi, old_cpu, &old_pd);
+                        }
+
+                        list_move(&llap->llap_pglist_item,
+                                  &pd->llpd_list);
+                        old_pd->llpd_gen++;
+                        if (pd->llpd_cpu != old_cpu) {
+                                pd->llpd_count++;
+                                old_pd->llpd_count--;
+                                old_pd->llpd_gen++;
+                                llap->llap_pglist_cpu = pd->llpd_cpu;
+                                pd->llpd_cross++;
+                        }
+                        ll_pglist_double_unlock(sbi, old_cpu);
                  }
                  GOTO(out, llap);
          }
@@ -627,8 +822,28 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
                  RETURN(ERR_PTR(-EINVAL));
  
          /* limit the number of lustre-cached pages */
-        if (sbi->ll_async_page_count >= sbi->ll_async_page_max)
-                llap_shrink_cache(sbi, 0);
+        cpu = get_cpu();
+        pd = LL_PGLIST_DATA(sbi);
+        target = pd->llpd_count - pd->llpd_budget;
+        if (target > 0) {
+                rc = 0;
+                atomic_inc(&pd->llpd_sample_count);
+                if (atomic_read(&pd->llpd_sample_count) > 
+                    sbi->ll_async_page_sample_max) {
+                        pd->llpd_reblnc_count++;
+                        rc = llap_async_cache_rebalance(sbi);
+                        if (rc == 0)
+                                target = pd->llpd_count - pd->llpd_budget;
+                }
+                /* if rc equals 1, it means other cpu is doing the rebalance
+                 * job, and our budget # would be modified when we read it. 
+                 * Furthermore, it is much likely being increased because
+                 * we have already reached the rebalance threshold. In this
+                 * case, we skip to shrink cache here. */
+                if ((rc == 0) && target > 0)
+                        llap_shrink_cache_internal(sbi, cpu, target + 32);
+        }
+        put_cpu();
  
          OBD_SLAB_ALLOC(llap, ll_async_page_slab, CFS_ALLOC_STD,
                         ll_async_page_slab_size);
@@ -657,11 +872,14 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
          __set_page_ll_data(page, llap);
          llap->llap_page = page;
  
-        spin_lock(&sbi->ll_lock);
-        sbi->ll_pglist_gen++;
-        sbi->ll_async_page_count++;
-        list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist);
-        spin_unlock(&sbi->ll_lock);
+        lcounter_inc(&sbi->ll_async_page_count);
+        pd = ll_pglist_lock(sbi);
+        list_add_tail(&llap->llap_pglist_item, &pd->llpd_list);
+        pd->llpd_count++;
+        pd->llpd_gen++;
+        pd->llpd_miss++;
+        llap->llap_pglist_cpu = pd->llpd_cpu;
+        ll_pglist_unlock(sbi);
  
   out:
          if (unlikely(sbi->ll_flags & LL_SBI_LLITE_CHECKSUM)) {
@@ -825,7 +1043,7 @@ int ll_commit_write(struct file *file, struct page *page, unsigned from,
          if (exp == NULL)
                  RETURN(-EINVAL);
  
-        llap->llap_ignore_quota = capable(CAP_SYS_RESOURCE);
+        llap->llap_ignore_quota = cfs_capable(CFS_CAP_SYS_RESOURCE);
  
          /* queue a write for some time in the future the first time we
           * dirty the page */
@@ -865,28 +1083,40 @@ out:
          RETURN(rc);
  }
  
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
+
+/* WARNING: This algorithm is used to reduce the contention on 
+ * sbi->ll_lock. It should work well if the ra_max_pages is much 
+ * greater than the single file's read-ahead window.
+ *
+ * TODO: There may exist a `global sync problem' in this implementation. 
+ * Considering the global ra window is 100M, and each file's ra window is 10M,
+ * there are over 10 files trying to get its ra budget and reach 
+ * ll_ra_count_get at the exactly same time. All of them will get a zero ra
+ * window, although the global window is 100M. -jay
+ */
  static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
  {
          struct ll_ra_info *ra = &sbi->ll_ra_info;
          unsigned long ret;
          ENTRY;
  
-        spin_lock(&sbi->ll_lock);
-        ret = min(ra->ra_max_pages - ra->ra_cur_pages, len);
-        ra->ra_cur_pages += ret;
-        spin_unlock(&sbi->ll_lock);
+        ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
+        if ((int)ret < 0)
+                GOTO(out, ret = 0);
  
+        if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
+                atomic_sub(ret, &ra->ra_cur_pages);
+                ret = 0;
+        }
+out:
          RETURN(ret);
  }
  
  static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
  {
          struct ll_ra_info *ra = &sbi->ll_ra_info;
-        spin_lock(&sbi->ll_lock);
-        LASSERTF(ra->ra_cur_pages >= len, "r_c_p %lu len %lu\n",
-                 ra->ra_cur_pages, len);
-        ra->ra_cur_pages -= len;
-        spin_unlock(&sbi->ll_lock);
+        atomic_sub(len, &ra->ra_cur_pages);
  }
  
  /* called for each page in a completed rpc.*/
@@ -920,14 +1150,10 @@ int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
                          llap->llap_defer_uptodate = 0;
                  }
                  SetPageError(page);
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
                  if (rc == -ENOSPC)
                          set_bit(AS_ENOSPC, &page->mapping->flags);
                  else
                          set_bit(AS_EIO, &page->mapping->flags);
-#else
-                page->mapping->gfp_mask |= AS_EIO_MASK;
-#endif
          }
  
          /* be carefull about clear WB.
@@ -956,7 +1182,8 @@ static void __ll_put_llap(struct page *page)
          struct obd_export *exp;
          struct ll_async_page *llap;
          struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int rc;
+        struct ll_pglist_data *pd;
+        int rc, cpu;
          ENTRY;
  
          exp = ll_i2obdexp(inode);
@@ -984,12 +1211,14 @@ static void __ll_put_llap(struct page *page)
           * is providing exclusivity to memory pressure/truncate/writeback..*/
          __clear_page_ll_data(page);
  
-        spin_lock(&sbi->ll_lock);
+        lcounter_dec(&sbi->ll_async_page_count);
+        cpu = llap->llap_pglist_cpu;
+        pd = ll_pglist_cpu_lock(sbi, cpu);
+        pd->llpd_gen++;
+        pd->llpd_count--;
          if (!list_empty(&llap->llap_pglist_item))
                  list_del_init(&llap->llap_pglist_item);
-        sbi->ll_pglist_gen++;
-        sbi->ll_async_page_count--;
-        spin_unlock(&sbi->ll_lock);
+        ll_pglist_cpu_unlock(sbi, cpu);
          OBD_SLAB_FREE(llap, ll_async_page_slab, ll_async_page_slab_size);
  
          EXIT;
@@ -1043,20 +1272,16 @@ static int ll_issue_page_read(struct obd_export *exp,
          RETURN(rc);
  }
  
-static void ll_ra_stats_inc_unlocked(struct ll_ra_info *ra, enum ra_stat which)
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
  {
          LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
-        ra->ra_stats[which]++;
+        lprocfs_counter_incr(sbi->ll_ra_stats, which);
  }
  
  static void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
  {
          struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
-        struct ll_ra_info *ra = &ll_i2sbi(mapping->host)->ll_ra_info;
-
-        spin_lock(&sbi->ll_lock);
-        ll_ra_stats_inc_unlocked(ra, which);
-        spin_unlock(&sbi->ll_lock);
+        ll_ra_stats_inc_sbi(sbi, which);
  }
  
  void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping)
@@ -1467,6 +1692,8 @@ static void ras_reset(struct ll_readahead_state *ras, unsigned long index)
  static void ras_stride_reset(struct ll_readahead_state *ras)
  {
          ras->ras_consecutive_stride_requests = 0;
+        ras->ras_stride_length = 0;
+        ras->ras_stride_pages = 0;
          RAS_CDEBUG(ras);
  }
  
@@ -1478,46 +1705,39 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
          INIT_LIST_HEAD(&ras->ras_read_beads);
  }
  
-/* Check whether the read request is in the stride window.
+/* 
+ * Check whether the read request is in the stride window.
   * If it is in the stride window, return 1, otherwise return 0.
- * and also update stride_gap and stride_pages.
   */
  static int index_in_stride_window(unsigned long index,
                                    struct ll_readahead_state *ras,
                                    struct inode *inode)
  {
-        int stride_gap = index - ras->ras_last_readpage - 1;
-
-        LASSERT(stride_gap != 0);
-
-        if (ras->ras_consecutive_pages == 0)
+        unsigned long stride_gap = index - ras->ras_last_readpage - 1;
+ 
+        if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0)
                  return 0;
  
+        /* If it is contiguous read */
+        if (stride_gap == 0)
+                return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
+        
          /*Otherwise check the stride by itself */
-        if ((ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
-            ras->ras_consecutive_pages == ras->ras_stride_pages)
-                return 1;
+        return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
+             ras->ras_consecutive_pages == ras->ras_stride_pages;
+}
+
+static void ras_update_stride_detector(struct ll_readahead_state *ras,
+                                       unsigned long index)
+{
+        unsigned long stride_gap = index - ras->ras_last_readpage - 1;
  
-        if (stride_gap >= 0) {
-                /*
-                 * only set stride_pages, stride_length if
-                 * it is forward reading ( stride_gap > 0)
-                 */
+        if (!stride_io_mode(ras) && (stride_gap != 0 || 
+             ras->ras_consecutive_stride_requests == 0)) {
                  ras->ras_stride_pages = ras->ras_consecutive_pages;
-                ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
-        } else {
-                /*
-                 * If stride_gap < 0,(back_forward reading),
-                 * reset the stride_pages/length.
-                 * FIXME:back_ward stride I/O read.
-                 *
-                 */
-                ras->ras_stride_pages = 0;
-                ras->ras_stride_length = 0;
+                ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
          }
          RAS_CDEBUG(ras);
-
-        return 0;
  }
  
  static unsigned long
@@ -1589,13 +1809,12 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                         unsigned hit)
  {
          struct ll_ra_info *ra = &sbi->ll_ra_info;
-        int zero = 0, stride_zero = 0, stride_detect = 0, ra_miss = 0;
+        int zero = 0, stride_detect = 0, ra_miss = 0;
          ENTRY;
  
-        spin_lock(&sbi->ll_lock);
          spin_lock(&ras->ras_lock);
  
-        ll_ra_stats_inc_unlocked(ra, hit ? RA_STAT_HIT : RA_STAT_MISS);
+        ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
  
          /* reset the read-ahead window in two cases.  First when the app seeks
           * or reads to some other part of the file.  Secondly if we get a
@@ -1604,22 +1823,13 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
           * reclaiming it before we get to it. */
          if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
                  zero = 1;
-                ll_ra_stats_inc_unlocked(ra, RA_STAT_DISTANT_READPAGE);
-               /* check whether it is in stride I/O mode*/
-                if (!index_in_stride_window(index, ras, inode))
-                        stride_zero = 1;
+                ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
          } else if (!hit && ras->ras_window_len &&
                     index < ras->ras_next_readahead &&
                     index_in_window(index, ras->ras_window_start, 0,
                                     ras->ras_window_len)) {
-                zero = 1;
                 ra_miss = 1;
-                /* If it hits read-ahead miss and the stride I/O is still
-                 * not detected, reset stride stuff to re-detect the whole
-                 * stride I/O mode to avoid complication */
-                if (!stride_io_mode(ras))
-                        stride_zero = 1;
-                ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
+                ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
          }
  
          /* On the second access to a file smaller than the tunable
@@ -1647,42 +1857,51 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                          GOTO(out_unlock, 0);
                  }
          }
-
          if (zero) {
-                /* If it is discontinuous read, check
-                 * whether it is stride I/O mode*/
-                if (stride_zero) {
+               /* check whether it is in stride I/O mode*/
+                if (!index_in_stride_window(index, ras, inode)) {
                          ras_reset(ras, index);
                          ras->ras_consecutive_pages++;
                          ras_stride_reset(ras);
-                        RAS_CDEBUG(ras);
                          GOTO(out_unlock, 0);
                  } else {
-                        /* The read is still in stride window or
-                        * it hits read-ahead miss */
-
-                        /* If ra-window miss is hitted, which probably means VM
-                         * pressure, and some read-ahead pages were reclaimed.So
-                         * the length of ra-window will not increased, but also
-                         * not reset to avoid redetecting the stride I/O mode.*/
-                       ras->ras_consecutive_requests = 0;
-                        if (!ra_miss) {
-                                ras->ras_consecutive_pages = 0;
-                                if (++ras->ras_consecutive_stride_requests > 1)
-                                        stride_detect = 1;
-                        }
+                       ras->ras_consecutive_requests = 0;
+                        if (++ras->ras_consecutive_stride_requests > 1)
+                                stride_detect = 1;
                          RAS_CDEBUG(ras);
                  }
-        } else if (ras->ras_consecutive_stride_requests > 1) {
-                /* If this is contiguous read but in stride I/O mode
-                 * currently, check whether stride step still is valid,
-                 * if invalid, it will reset the stride ra window*/    
-                if (ras->ras_consecutive_pages + 1 > ras->ras_stride_pages)
-                        ras_stride_reset(ras);
+        } else {
+                if (ra_miss) {
+                        if (index_in_stride_window(index, ras, inode) &&
+                            stride_io_mode(ras)) {
+                                /*If stride-RA hit cache miss, the stride dector 
+                                 *will not be reset to avoid the overhead of
+                                 *redetecting read-ahead mode */
+                                if (index != ras->ras_last_readpage + 1)
+                                       ras->ras_consecutive_pages = 0;
+                                RAS_CDEBUG(ras);
+                        } else {
+                                /* Reset both stride window and normal RA window */
+                                ras_reset(ras, index);
+                                ras->ras_consecutive_pages++;
+                                ras_stride_reset(ras);
+                                GOTO(out_unlock, 0);
+                        }
+                } else if (stride_io_mode(ras)) {
+                        /* If this is contiguous read but in stride I/O mode
+                         * currently, check whether stride step still is valid,
+                         * if invalid, it will reset the stride ra window*/    
+                        if (!index_in_stride_window(index, ras, inode)) {
+                                /* Shrink stride read-ahead window to be zero */
+                                ras_stride_reset(ras);
+                                ras->ras_window_len = 0;
+                                ras->ras_next_readahead = index;
+                        }
+                }
          }
-
-        ras->ras_last_readpage = index;
          ras->ras_consecutive_pages++;
+        ras_update_stride_detector(ras, index);
+        ras->ras_last_readpage = index;
          ras_set_start(ras, index);
          ras->ras_next_readahead = max(ras->ras_window_start,
                                        ras->ras_next_readahead);
@@ -1716,7 +1935,6 @@ out_unlock:
          RAS_CDEBUG(ras);
          ras->ras_request_index++;
          spin_unlock(&ras->ras_lock);
-        spin_unlock(&sbi->ll_lock);
          return;
  }
  
@@ -2020,7 +2238,7 @@ static int ll_file_oig_pages(struct inode * inode, struct page **pages,
          if (rc)
                  RETURN(rc);
          brw_flags = OBD_BRW_SRVLOCK;
-        if (capable(CAP_SYS_RESOURCE))
+        if (cfs_capable(CFS_CAP_SYS_RESOURCE))
                  brw_flags |= OBD_BRW_NOQUOTA;
  
          for (i = 0; i < numpages; i++) {
@@ -2113,7 +2331,7 @@ ssize_t ll_file_lockless_io(struct file *file, const struct iovec *iov,
                  rc = generic_write_checks(file, ppos, &count, 0);
                  if (rc)
                          GOTO(out, rc);
-                rc = remove_suid(file->f_dentry);
+                rc = ll_remove_suid(file->f_dentry, file->f_vfsmnt);
                  if (rc)
                          GOTO(out, rc);
          }
diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c

index ab83f8f..4652d5f 100644 (file)
--- a/lustre/llite/rw24.c
+++ b/lustre/llite/rw24.c
@@ -1,25 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Lite I/O page cache for the 2.4 kernel version
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/rw24.c
+ *
+ * Lustre Lite I/O page cache for the 2.4 kernel version
+ */
+
  #ifndef AUTOCONF_INCLUDED
  #include <linux/config.h>
  #endif
@@ -141,4 +159,3 @@ struct address_space_operations ll_aops = {
          .max_readahead  = ll_max_readahead,
  #endif
  };
-
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c

index 82efcd3..4ed7a49 100644 (file)
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -1,25 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * lustre/llite/rw26.c
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
   */
+
  #ifndef AUTOCONF_INCLUDED
  #include <linux/config.h>
  #endif
@@ -36,7 +54,6 @@
  
  #include <linux/fs.h>
  #include <linux/buffer_head.h>
-#include <linux/mpage.h>
  #include <linux/writeback.h>
  #include <linux/stat.h>
  #include <asm/uaccess.h>
@@ -80,7 +97,12 @@ static void ll_invalidatepage(struct page *page, unsigned long offset)
  }
  #endif
  
-static int ll_releasepage(struct page *page, gfp_t gfp_mask)
+#ifdef HAVE_RELEASEPAGE_WITH_GFP
+#define RELEASEPAGE_ARG_TYPE gfp_t
+#else
+#define RELEASEPAGE_ARG_TYPE int
+#endif
+static int ll_releasepage(struct page *page, RELEASEPAGE_ARG_TYPE gfp_mask)
  {
          if (PagePrivate(page))
                  ll_removepage(page);
@@ -159,7 +181,7 @@ static ssize_t ll_direct_IO_26_seg(int rw, struct inode *inode,
                  /* To the end of the page, or the length, whatever is less */
                  pga[i].count = min_t(int, CFS_PAGE_SIZE -(file_offset & ~CFS_PAGE_MASK),
                                       length);
-                pga[i].flag = 0;
+                pga[i].flag = OBD_BRW_SYNC;
                  if (rw == READ)
                          POISON_PAGE(pages[i], 0x0d);
          }
diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c

index 6262dd0..2ecdc67 100644 (file)
--- a/lustre/llite/statahead.c
+++ b/lustre/llite/statahead.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2007 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <linux/fs.h>
@@ -38,6 +53,8 @@ struct ll_sai_entry {
          struct list_head        se_list;
          unsigned int            se_index;
          int                     se_stat;
+        struct ptlrpc_request  *se_req;
+        struct md_enqueue_info *se_minfo;
  };
  
  enum {
@@ -48,6 +65,96 @@ enum {
  static unsigned int sai_generation = 0;
  static spinlock_t sai_generation_lock = SPIN_LOCK_UNLOCKED;
  
+/**
+ * Check whether first entry was stated already or not.
+ * No need to hold lli_lock, for:
+ * (1) it is me that remove entry from the list
+ * (2) the statahead thread only add new entry to the list
+ */
+static int ll_sai_entry_stated(struct ll_statahead_info *sai)
+{
+        struct ll_sai_entry  *entry;
+        int                   rc = 0;
+
+        if (!list_empty(&sai->sai_entries_stated)) {
+                entry = list_entry(sai->sai_entries_stated.next,
+                                   struct ll_sai_entry, se_list);
+                if (entry->se_index == sai->sai_index_next)
+                        rc = 1;
+        }
+        return rc;
+}
+
+static inline int sa_received_empty(struct ll_statahead_info *sai)
+{
+        return list_empty(&sai->sai_entries_received);
+}
+
+static inline int sa_not_full(struct ll_statahead_info *sai)
+{
+        return (sai->sai_index < sai->sai_hit + sai->sai_miss + sai->sai_max);
+}
+
+static inline int sa_is_running(struct ll_statahead_info *sai)
+{
+        return !!(sai->sai_thread.t_flags & SVC_RUNNING);
+}
+
+static inline int sa_is_stopping(struct ll_statahead_info *sai)
+{
+        return !!(sai->sai_thread.t_flags & SVC_STOPPING);
+}
+
+static inline int sa_is_stopped(struct ll_statahead_info *sai)
+{
+        return !!(sai->sai_thread.t_flags & SVC_STOPPED);
+}
+
+/**
+ * (1) hit ratio less than 80%
+ * or
+ * (2) consecutive miss more than 8
+ */
+static inline int sa_low_hit(struct ll_statahead_info *sai)
+{
+        return ((sai->sai_hit < 4 * sai->sai_miss && sai->sai_hit > 7) ||
+                (sai->sai_consecutive_miss > 8));
+}
+
+/**
+ * process the deleted entry's member and free the entry.
+ * (1) release intent
+ * (2) free md_enqueue_info
+ * (3) drop dentry's ref count
+ * (4) release request's ref count
+ */
+static void ll_sai_entry_cleanup(struct ll_sai_entry *entry, int free)
+{
+        struct ptlrpc_request  *req = entry->se_req;
+        struct md_enqueue_info *minfo = entry->se_minfo;
+        ENTRY;
+
+        if (minfo) {
+                struct dentry        *dentry = minfo->mi_dentry;
+                struct lookup_intent *it = &minfo->mi_it;
+
+                entry->se_minfo = NULL;
+                ll_intent_release(it);
+                OBD_FREE_PTR(minfo);
+                dput(dentry);
+        }
+        if (req) {
+                entry->se_req = NULL;
+                ptlrpc_req_finished(req);
+        }
+        if (free) {
+                LASSERT(list_empty(&entry->se_list));
+                OBD_FREE_PTR(entry);
+        }
+
+        EXIT;
+}
+
  static struct ll_statahead_info *ll_sai_alloc(void)
  {
          struct ll_statahead_info *sai;
@@ -65,11 +172,13 @@ static struct ll_statahead_info *ll_sai_alloc(void)
          sai->sai_max = LL_SA_RPC_MIN;
          cfs_waitq_init(&sai->sai_waitq);
          cfs_waitq_init(&sai->sai_thread.t_ctl_waitq);
-        CFS_INIT_LIST_HEAD(&sai->sai_entries);
+        CFS_INIT_LIST_HEAD(&sai->sai_entries_sent);
+        CFS_INIT_LIST_HEAD(&sai->sai_entries_received);
+        CFS_INIT_LIST_HEAD(&sai->sai_entries_stated);
          return sai;
  }
  
-static inline 
+static inline
  struct ll_statahead_info *ll_sai_get(struct ll_statahead_info *sai)
  {
          LASSERT(sai);
@@ -80,16 +189,20 @@ struct ll_statahead_info *ll_sai_get(struct ll_statahead_info *sai)
  static void ll_sai_put(struct ll_statahead_info *sai)
  {
          struct inode         *inode = sai->sai_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
+        struct ll_inode_info *lli;
          ENTRY;
  
+        LASSERT(inode != NULL);
+        lli = ll_i2info(inode);
          if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_lock)) {
                  struct ll_sai_entry *entry, *next;
  
+                LASSERT(lli->lli_opendir_key == NULL);
                  lli->lli_sai = NULL;
+                lli->lli_opendir_pid = 0;
                  spin_unlock(&lli->lli_lock);
  
-                LASSERT(sai->sai_thread.t_flags & SVC_STOPPED);
+                LASSERT(sa_is_stopped(sai));
  
                  if (sai->sai_sent > sai->sai_replied)
                          CDEBUG(D_READA,"statahead for dir %lu/%u does not "
@@ -97,10 +210,20 @@ static void ll_sai_put(struct ll_statahead_info *sai)
                                inode->i_ino, inode->i_generation,
                                sai->sai_sent, sai->sai_replied);
  
-                list_for_each_entry_safe(entry, next, &sai->sai_entries,
+                list_for_each_entry_safe(entry, next, &sai->sai_entries_sent,
                                           se_list) {
-                        list_del(&entry->se_list);
-                        OBD_FREE_PTR(entry);
+                        list_del_init(&entry->se_list);
+                        ll_sai_entry_cleanup(entry, 1);
+                }
+                list_for_each_entry_safe(entry, next, &sai->sai_entries_received,
+                                         se_list) {
+                        list_del_init(&entry->se_list);
+                        ll_sai_entry_cleanup(entry, 1);
+                }
+                list_for_each_entry_safe(entry, next, &sai->sai_entries_stated,
+                                         se_list) {
+                        list_del_init(&entry->se_list);
+                        ll_sai_entry_cleanup(entry, 1);
                  }
                  OBD_FREE_PTR(sai);
                  iput(inode);
@@ -108,8 +231,11 @@ static void ll_sai_put(struct ll_statahead_info *sai)
          EXIT;
  }
  
+/**
+ * insert it into sai_entries_sent tail when init.
+ */
  static struct ll_sai_entry *
-ll_sai_entry_get(struct ll_statahead_info *sai, unsigned int index, int stat)
+ll_sai_entry_init(struct ll_statahead_info *sai, unsigned int index)
  {
          struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
          struct ll_sai_entry  *entry;
@@ -118,148 +244,184 @@ ll_sai_entry_get(struct ll_statahead_info *sai, unsigned int index, int stat)
          OBD_ALLOC_PTR(entry);
          if (entry == NULL)
                  RETURN(ERR_PTR(-ENOMEM));
-        
-        CDEBUG(D_READA, "alloc sai entry %p index %u, stat %d\n",
-               entry, index, stat);
+
+        CDEBUG(D_READA, "alloc sai entry %p index %u\n",
+               entry, index);
          entry->se_index = index;
-        entry->se_stat  = stat;
+        entry->se_stat  = SA_ENTRY_UNSTATED;
  
          spin_lock(&lli->lli_lock);
-        list_add_tail(&entry->se_list, &sai->sai_entries);
+        list_add_tail(&entry->se_list, &sai->sai_entries_sent);
          spin_unlock(&lli->lli_lock);
  
          RETURN(entry);
  }
  
-/*
- * inside lli_lock
- * return value:
- *  0: can not find the entry with the index
- *  1: it is the first entry
- *  2: it is not the first entry
+/**
+ * delete it from sai_entries_stated head when fini, it need not
+ * to process entry's member.
   */
-static int
-ll_sai_entry_set(struct ll_statahead_info *sai, unsigned int index, int stat)
+static void ll_sai_entry_fini(struct ll_statahead_info *sai)
  {
-        struct ll_sai_entry *entry;
-        int                  rc = 0;
+        struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+        struct ll_sai_entry  *entry;
          ENTRY;
  
-        if (list_empty(&sai->sai_entries))
-                RETURN(0);
-
-        entry = list_entry(sai->sai_entries.next, struct ll_sai_entry, se_list);
-        if (entry->se_index == index)
-                GOTO(out, rc = 1);
-
-        while (entry->se_list.next != &sai->sai_entries &&
-               entry->se_index < index) {
-                entry = list_entry(entry->se_list.next, struct ll_sai_entry,
-                                   se_list);
-                if (entry->se_index == index)
-                        GOTO(out, rc = 2);
-        }
+        spin_lock(&lli->lli_lock);
+        sai->sai_index_next++;
+        if (likely(!list_empty(&sai->sai_entries_stated))) {
+                entry = list_entry(sai->sai_entries_stated.next,
+                                   struct ll_sai_entry, se_list);
+                if (entry->se_index < sai->sai_index_next) {
+                        list_del(&entry->se_list);
+                        OBD_FREE_PTR(entry);
+                }
+        } else
+                LASSERT(sa_is_stopped(sai));
+        spin_unlock(&lli->lli_lock);
  
          EXIT;
-
-out:
-        if (rc) {
-                LASSERT(entry->se_stat == SA_ENTRY_UNSTATED);
-                entry->se_stat = stat;
-        }
-
-        return rc;
  }
  
-/*
- * Check whether first entry was stated already or not.
- * No need to hold lli_lock, for:
- * (1) it is me that remove entry from the list
- * (2) the statahead thread only add new entry to the list tail
+/**
+ * inside lli_lock.
+ * \retval NULL : can not find the entry in sai_entries_sent with the index
+ * \retval entry: find the entry in sai_entries_sent with the index
   */
-static int ll_sai_entry_stated(struct ll_statahead_info *sai)
+static struct ll_sai_entry *
+ll_sai_entry_set(struct ll_statahead_info *sai, unsigned int index, int stat,
+                 struct ptlrpc_request *req, struct md_enqueue_info *minfo)
  {
-        struct ll_sai_entry  *entry;
-        int                   rc = 0;
+        struct ll_sai_entry *entry;
          ENTRY;
  
-        if (!list_empty(&sai->sai_entries)) {
-                entry = list_entry(sai->sai_entries.next, struct ll_sai_entry,
-                                   se_list);
-                rc = (entry->se_stat != SA_ENTRY_UNSTATED);
+        if (!list_empty(&sai->sai_entries_sent)) {
+                list_for_each_entry(entry, &sai->sai_entries_sent, se_list) {
+                        if (entry->se_index == index) {
+                                entry->se_stat = stat;
+                                entry->se_req = ptlrpc_request_addref(req);
+                                entry->se_minfo = minfo;
+                                RETURN(entry);
+                        } else if (entry->se_index > index)
+                                RETURN(NULL);
+                }
          }
+        RETURN(NULL);
+}
  
-        RETURN(rc);
+/**
+ * inside lli_lock.
+ * Move entry to sai_entries_received and
+ * insert it into sai_entries_received tail.
+ */
+static inline void
+ll_sai_entry_to_received(struct ll_statahead_info *sai, struct ll_sai_entry *entry)
+{
+        if (!list_empty(&entry->se_list))
+                list_del_init(&entry->se_list);
+        list_add_tail(&entry->se_list, &sai->sai_entries_received);
  }
  
-static void ll_sai_entry_put(struct ll_statahead_info *sai)
+/**
+ * Move entry to sai_entries_stated and
+ * sort with the index.
+ */
+static int
+ll_sai_entry_to_stated(struct ll_statahead_info *sai, struct ll_sai_entry *entry)
  {
          struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
-        struct ll_sai_entry  *entry;
+        struct ll_sai_entry  *se;
          ENTRY;
-        
+
+        ll_sai_entry_cleanup(entry, 0);
+
          spin_lock(&lli->lli_lock);
-        if (!list_empty(&sai->sai_entries)) {
-                entry = list_entry(sai->sai_entries.next,
-                                   struct ll_sai_entry, se_list);
-                list_del(&entry->se_list);
+        if (!list_empty(&entry->se_list))
+                list_del_init(&entry->se_list);
+
+        if (unlikely(entry->se_index < sai->sai_index_next)) {
+                spin_unlock(&lli->lli_lock);
                  OBD_FREE_PTR(entry);
+                RETURN(0);
          }
-        spin_unlock(&lli->lli_lock);
  
-        EXIT;
+        list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) {
+                if (se->se_index < entry->se_index) {
+                        list_add(&entry->se_list, &se->se_list);
+                        spin_unlock(&lli->lli_lock);
+                        RETURN(1);
+                }
+        }
+
+        /*
+         * I am the first entry.
+         */
+        list_add(&entry->se_list, &sai->sai_entries_stated);
+        spin_unlock(&lli->lli_lock);
+        RETURN(1);
  }
  
-/* finish lookup/revalidate */
-static int ll_statahead_interpret(struct obd_export *exp,
-                                  struct ptlrpc_request *req,
-                                  struct md_enqueue_info *minfo,
-                                  int rc)
+/**
+ * finish lookup/revalidate.
+ */
+static int do_statahead_interpret(struct ll_statahead_info *sai)
  {
-        struct lookup_intent     *it = &minfo->mi_it;
-        struct dentry            *dentry = minfo->mi_dentry;
-        struct inode             *dir = dentry->d_parent->d_inode;
-        struct ll_inode_info     *lli = ll_i2info(dir);
-        struct ll_statahead_info *sai = NULL;
+        struct ll_inode_info   *lli = ll_i2info(sai->sai_inode);
+        struct ll_sai_entry    *entry;
+        struct ptlrpc_request  *req;
+        struct md_enqueue_info *minfo;
+        struct dentry          *dentry;
+        struct lookup_intent   *it;
+        int                     rc = 0;
          ENTRY;
  
-        CDEBUG(D_READA, "interpret statahead %.*s rc %d\n",
-               dentry->d_name.len, dentry->d_name.name, rc);
-
          spin_lock(&lli->lli_lock);
-        if (unlikely(lli->lli_sai == NULL ||
-            lli->lli_sai->sai_generation != minfo->mi_generation)) {
-                spin_unlock(&lli->lli_lock);
-                GOTO(out_free, rc = -ESTALE);
-        } else {
-                sai = ll_sai_get(lli->lli_sai);
-                spin_unlock(&lli->lli_lock);
+        LASSERT(!sa_received_empty(sai));
+        entry = list_entry(sai->sai_entries_received.next, struct ll_sai_entry,
+                           se_list);
+        list_del_init(&entry->se_list);
+        spin_unlock(&lli->lli_lock);
+
+        if (unlikely(entry->se_index < sai->sai_index_next)) {
+                ll_sai_entry_cleanup(entry, 1);
+                RETURN(0);
          }
  
-        if (rc || dir == NULL)
-                GOTO(out, rc);
+        if (entry->se_stat != SA_ENTRY_STATED)
+                GOTO(out, rc = entry->se_stat);
+
+        req = entry->se_req;
+        minfo = entry->se_minfo;
+        dentry = minfo->mi_dentry;
+        it = &minfo->mi_it;
  
          if (dentry->d_inode == NULL) {
-                /* lookup */
+                /*
+                 * lookup.
+                 */
                  struct dentry    *save = dentry;
                  struct it_cb_data icbd = {
-                        .icbd_parent   = dir,
+                        .icbd_parent   = dentry->d_parent->d_inode,
                          .icbd_childp   = &dentry
                  };
  
                  rc = lookup_it_finish(req, DLM_REPLY_REC_OFF, it, &icbd);
                  if (!rc)
-                        /* 
+                        /*
                           * Here dentry->d_inode might be NULL,
                           * because the entry may have been removed before
                           * we start doing stat ahead.
                           */
                          ll_lookup_finish_locks(it, dentry);
  
-                if (dentry != save)
+                if (dentry != save) {
+                        minfo->mi_dentry = dentry;
                          dput(save);
+                }
          } else {
-                /* revalidate */
+                /*
+                 * revalidate.
+                 */
                  struct mds_body *body;
  
                  body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF,
@@ -276,6 +438,7 @@ static int ll_statahead_interpret(struct obd_export *exp,
                          GOTO(out, rc);
                  }
  
+                spin_lock(&ll_lookup_lock);
                  spin_lock(&dcache_lock);
                  lock_dentry(dentry);
                  __d_drop(dentry);
@@ -285,42 +448,67 @@ static int ll_statahead_interpret(struct obd_export *exp,
                  unlock_dentry(dentry);
                  d_rehash_cond(dentry, 0);
                  spin_unlock(&dcache_lock);
+                spin_unlock(&ll_lookup_lock);
  
                  ll_lookup_finish_locks(it, dentry);
          }
          EXIT;
  
  out:
-        if (sai != NULL) {
-                int first;
+        if (likely(ll_sai_entry_to_stated(sai, entry)))
+                cfs_waitq_signal(&sai->sai_waitq);
+        return rc;
+}
  
-                sai->sai_replied++;
-                spin_lock(&lli->lli_lock);
-                first = ll_sai_entry_set(sai,
-                                         (unsigned int)(long)minfo->mi_cbdata,
-                                         SA_ENTRY_STATED);
-                /*
-                 * wake up the "ls -l" process only when the first entry
-                 * returned.
-                 */
+static int ll_statahead_interpret(struct obd_export *exp,
+                                  struct ptlrpc_request *req,
+                                  struct md_enqueue_info *minfo,
+                                  int rc)
+{
+        struct dentry            *dentry = minfo->mi_dentry;
+        struct lookup_intent     *it = &minfo->mi_it;
+        struct inode             *dir = dentry->d_parent->d_inode;
+        struct ll_inode_info     *lli = ll_i2info(dir);
+        struct ll_statahead_info *sai;
+        struct ll_sai_entry      *entry;
+        ENTRY;
+
+        CDEBUG(D_READA, "interpret statahead %.*s rc %d\n",
+               dentry->d_name.len, dentry->d_name.name, rc);
+
+        spin_lock(&lli->lli_lock);
+        if (unlikely(lli->lli_sai == NULL ||
+                     lli->lli_sai->sai_generation != minfo->mi_generation)) {
                  spin_unlock(&lli->lli_lock);
-                if (first == 1)
-                        cfs_waitq_signal(&sai->sai_waitq);
-                else if (first == 0)
-                        CDEBUG(D_READA, "can't find sai entry for dir "
-                               "%lu/%u generation %u index %d\n",
-                               dir->i_ino, dir->i_generation,
-                               minfo->mi_generation,
-                               (unsigned int)(long)minfo->mi_cbdata);
+                ll_intent_release(it);
+                dput(dentry);
+                OBD_FREE_PTR(minfo);
+                RETURN(-ESTALE);
+        } else {
+                sai = ll_sai_get(lli->lli_sai);
+                if (rc || dir == NULL)
+                        rc = -ESTALE;
  
+                entry = ll_sai_entry_set(sai,
+                                         (unsigned int)(long)minfo->mi_cbdata,
+                                         rc ? SA_ENTRY_UNSTATED :
+                                         SA_ENTRY_STATED, req, minfo);
+                LASSERT(entry != NULL);
+                if (likely(sa_is_running(sai))) {
+                        ll_sai_entry_to_received(sai, entry);
+                        sai->sai_replied++;
+                        spin_unlock(&lli->lli_lock);
+                        cfs_waitq_signal(&sai->sai_thread.t_ctl_waitq);
+                } else {
+                        if (!list_empty(&entry->se_list))
+                                list_del_init(&entry->se_list);
+                        sai->sai_replied++;
+                        spin_unlock(&lli->lli_lock);
+                        ll_sai_entry_cleanup(entry, 1);
+                }
                  ll_sai_put(sai);
+                RETURN(rc);
          }
-out_free:
-        ll_intent_release(it);
-        OBD_FREE_PTR(minfo);
-
-        dput(dentry);
-        return rc;
  }
  
  static void sa_args_fini(struct md_enqueue_info *minfo,
@@ -349,7 +537,6 @@ static int sa_args_prep(struct inode *dir, struct dentry *dentry,
                  return -ENOMEM;
          }
  
-        minfo->mi_exp = ll_i2mdcexp(dir);
          minfo->mi_it.it_op = IT_GETATTR;
          minfo->mi_dentry = dentry;
          minfo->mi_cb = ll_statahead_interpret;
@@ -369,12 +556,14 @@ static int sa_args_prep(struct inode *dir, struct dentry *dentry,
          return 0;
  }
  
-/* similar to ll_lookup_it(). */
+/**
+ * similar to ll_lookup_it().
+ */
  static int do_sa_lookup(struct inode *dir, struct dentry *dentry)
  {
          struct md_enqueue_info   *minfo;
          struct ldlm_enqueue_info *einfo;
-        int                       rc;                
+        int                       rc;
          ENTRY;
  
          rc = sa_args_prep(dir, dentry, &minfo, &einfo);
@@ -385,7 +574,7 @@ static int do_sa_lookup(struct inode *dir, struct dentry *dentry)
                                      dentry->d_name.name, dentry->d_name.len, 0,
                                      NULL);
          if (rc == 0)
-                rc = mdc_intent_getattr_async(minfo->mi_exp, minfo, einfo);
+                rc = mdc_intent_getattr_async(ll_i2mdcexp(dir), minfo, einfo);
  
          if (rc)
                  sa_args_fini(minfo, einfo);
@@ -393,16 +582,16 @@ static int do_sa_lookup(struct inode *dir, struct dentry *dentry)
          RETURN(rc);
  }
  
-/* 
+/**
   * similar to ll_revalidate_it().
- * return value:
- *  1      -- dentry valid
- *  0      -- will send stat-ahead request
- *  others -- prepare stat-ahead request failed
+ * \retval      1 -- dentry valid
+ * \retval      0 -- will send stat-ahead request
+ * \retval others -- prepare stat-ahead request failed
   */
  static int do_sa_revalidate(struct dentry *dentry)
  {
          struct inode             *inode = dentry->d_inode;
+        struct inode             *dir = dentry->d_parent->d_inode;
          struct ll_fid             fid;
          struct lookup_intent      it = { .it_op = IT_GETATTR };
          struct md_enqueue_info   *minfo;
@@ -421,13 +610,13 @@ static int do_sa_revalidate(struct dentry *dentry)
  
          ll_inode2fid(&fid, inode);
  
-        rc = mdc_revalidate_lock(ll_i2mdcexp(inode), &it, &fid);
+        rc = mdc_revalidate_lock(ll_i2mdcexp(dir), &it, &fid);
          if (rc == 1) {
                  ll_intent_release(&it);
                  RETURN(1);
          }
  
-        rc = sa_args_prep(dentry->d_parent->d_inode, dentry, &minfo, &einfo);
+        rc = sa_args_prep(dir, dentry, &minfo, &einfo);
          if (rc)
                  RETURN(rc);
  
@@ -435,7 +624,7 @@ static int do_sa_revalidate(struct dentry *dentry)
                                      inode, dentry->d_name.name,
                                      dentry->d_name.len, 0, NULL);
          if (rc == 0)
-                rc = mdc_intent_getattr_async(minfo->mi_exp, minfo, einfo);
+                rc = mdc_intent_getattr_async(ll_i2mdcexp(dir), minfo, einfo);
  
          if (rc)
                  sa_args_fini(minfo, einfo);
@@ -443,28 +632,22 @@ static int do_sa_revalidate(struct dentry *dentry)
          RETURN(rc);
  }
  
-static inline void ll_name2qstr(struct qstr *this, const char *name, int namelen)
+static inline void ll_name2qstr(struct qstr *q, const char *name, int namelen)
  {
-        unsigned long hash = init_name_hash();
-        unsigned int  c;
-
-        this->name = name;
-        this->len  = namelen;
-        for (; namelen > 0; namelen--, name++) {
-                c = *(const unsigned char *)name;
-                hash = partial_name_hash(c, hash);
-        }
-        this->hash = end_name_hash(hash);
+        q->name = name;
+        q->len  = namelen;
+        q->hash = full_name_hash(name, namelen);
  }
  
  static int ll_statahead_one(struct dentry *parent, struct ll_dir_entry *de)
  {
-        struct inode           *dir = parent->d_inode;
-        struct ll_inode_info   *lli = ll_i2info(dir);
-        struct qstr             name;
-        struct dentry          *dentry;
-        struct ll_sai_entry    *se;
-        int                     rc;
+        struct inode             *dir = parent->d_inode;
+        struct ll_inode_info     *lli = ll_i2info(dir);
+        struct ll_statahead_info *sai = lli->lli_sai;
+        struct qstr               name;
+        struct dentry            *dentry;
+        struct ll_sai_entry      *se;
+        int                       rc;
          ENTRY;
  
  #ifdef DCACHE_LUSTRE_INVALID
@@ -478,8 +661,7 @@ static int ll_statahead_one(struct dentry *parent, struct ll_dir_entry *de)
                  RETURN(-EINVAL);
          }
  
-        se = ll_sai_entry_get(lli->lli_sai, lli->lli_sai->sai_index,
-                              SA_ENTRY_UNSTATED);
+        se = ll_sai_entry_init(sai, sai->sai_index);
          if (IS_ERR(se))
                  RETURN(PTR_ERR(se));
  
@@ -507,34 +689,15 @@ out:
                  CDEBUG(D_READA, "set sai entry %p index %u stat %d rc %d\n",
                         se, se->se_index, se->se_stat, rc);
                  se->se_stat = rc;
-                cfs_waitq_signal(&lli->lli_sai->sai_waitq);
+                if (ll_sai_entry_to_stated(sai, se))
+                        cfs_waitq_signal(&sai->sai_waitq);
          } else {
-                lli->lli_sai->sai_sent++;
+                sai->sai_sent++;
          }
  
-        lli->lli_sai->sai_index++;
+        sai->sai_index++;
          return rc;
  }
-                
-static inline int sa_check_stop(struct ll_statahead_info *sai)
-{
-        return !!(sai->sai_thread.t_flags & SVC_STOPPING);
-}
-
-static inline int sa_not_full(struct ll_statahead_info *sai)
-{
-        return sai->sai_index < sai->sai_hit + sai->sai_miss + sai->sai_max;
-}
-
-/* (1) hit ratio less than 80%
- * or
- * (2) consecutive miss more than 8
- */
-static inline int sa_low_hit(struct ll_statahead_info *sai)
-{
-        return ((sai->sai_hit < 4 * sai->sai_miss && sai->sai_hit > 7) ||
-                (sai->sai_consecutive_miss > 8));
-}
  
  struct ll_sa_thread_args {
          struct dentry   *sta_parent;
@@ -548,17 +711,31 @@ static int ll_statahead_thread(void *arg)
          struct inode             *dir = parent->d_inode;
          struct ll_inode_info     *lli = ll_i2info(dir);
          struct ll_sb_info        *sbi = ll_i2sbi(dir);
-        struct ll_statahead_info *sai = ll_sai_get(lli->lli_sai);
-        struct ptlrpc_thread     *thread = &sai->sai_thread;
+        struct ll_statahead_info *sai;
+        struct ptlrpc_thread     *thread;
          unsigned long             index = 0;
          int                       first = 0;
          int                       rc = 0;
-        char                      name[16] = "";
          ENTRY;
  
+        spin_lock(&lli->lli_lock);
+        if (unlikely(lli->lli_sai == NULL)) {
+                spin_unlock(&lli->lli_lock);
+                dput(parent);
+                RETURN(-EAGAIN);
+        } else {
+                sai = ll_sai_get(lli->lli_sai);
+                spin_unlock(&lli->lli_lock);
+        }
+
+        {
+                char pname[16];
+                snprintf(pname, 15, "ll_sa_%u", sta->sta_pid);
+                cfs_daemonize(pname);
+        }
+
+        thread = &sai->sai_thread;
          sbi->ll_sa_total++;
-        snprintf(name, 15, "ll_sa_%u", sta->sta_pid);
-        cfs_daemonize(name);
          spin_lock(&lli->lli_lock);
          thread->t_flags = SVC_RUNNING;
          spin_unlock(&lli->lli_lock);
@@ -573,11 +750,25 @@ static int ll_statahead_thread(void *arg)
                  struct page *page;
  
                  npages = dir_pages(dir);
-                /* reach the end of dir */
+                /*
+                 * reach the end of dir.
+                 */
                  if (index >= npages) {
                          CDEBUG(D_READA, "reach end, index/npages %lu/%lu\n",
                                 index, npages);
-                        break;
+
+                        while (1) {
+                                l_wait_event(thread->t_ctl_waitq,
+                                             !sa_is_running(sai) ||
+                                             !sa_received_empty(sai) ||
+                                             sai->sai_sent == sai->sai_replied,
+                                             &lwi);
+                                if (!sa_received_empty(sai) &&
+                                    sa_is_running(sai))
+                                        do_statahead_interpret(sai);
+                                else
+                                        GOTO(out, rc);
+                        }
                  }
  
                  page = ll_get_dir_page(dir, index);
@@ -593,35 +784,56 @@ static int ll_statahead_thread(void *arg)
                  limit = kaddr + CFS_PAGE_SIZE - ll_dir_rec_len(1);
                  de = (struct ll_dir_entry *)kaddr;
                  if (!index) {
-                        de = ll_dir_next_entry(de); /* skip "." */
-                        de = ll_dir_next_entry(de); /* skip ".." */
+                        /*
+                         * skip "."
+                         */
+                        de = ll_dir_next_entry(de);
+                        /*
+                         * skip ".."
+                         */
+                        de = ll_dir_next_entry(de);
                  }
  
                  for (; (char*)de <= limit; de = ll_dir_next_entry(de)) {
-                        if (!de->lde_inode)
+                        if (de->lde_inode == 0)
                                  continue;
  
                          if (de->lde_name[0] == '.' && !sai->sai_ls_all) {
-                                /* skip hidden files */
+                                /*
+                                 * skip hidden files..
+                                 */
                                  sai->sai_skip_hidden++;
                                  continue;
                          }
  
-                        /* don't stat-ahead first entry */
+                        /*
+                         * don't stat-ahead first entry.
+                         */
                          if (unlikely(!first)) {
                                  first++;
                                  continue;
                          }
  
+keep_de:
                          l_wait_event(thread->t_ctl_waitq,
-                                     sa_check_stop(sai) || sa_not_full(sai),
+                                     !sa_is_running(sai) || sa_not_full(sai) ||
+                                     !sa_received_empty(sai),
                                       &lwi);
  
-                        if (unlikely(sa_check_stop(sai))) {
+                        while (!sa_received_empty(sai) && sa_is_running(sai))
+                                do_statahead_interpret(sai);
+
+                        if (unlikely(!sa_is_running(sai))) {
                                  ll_put_page(page);
                                  GOTO(out, rc);
                          }
  
+                        if (!sa_not_full(sai))
+                                /*
+                                 * do not skip the current de.
+                                 */
+                                goto keep_de;
+
                          rc = ll_statahead_one(parent, de);
                          if (rc < 0) {
                                  ll_put_page(page);
@@ -632,6 +844,7 @@ static int ll_statahead_thread(void *arg)
                  index++;
          }
          EXIT;
+
  out:
          spin_lock(&lli->lli_lock);
          thread->t_flags = SVC_STOPPED;
@@ -645,27 +858,29 @@ out:
          return rc;
  }
  
-/* called in ll_file_release() */
+/**
+ * called in ll_file_release().
+ */
  void ll_stop_statahead(struct inode *inode, void *key)
  {
          struct ll_inode_info *lli = ll_i2info(inode);
-        struct ptlrpc_thread *thread;
+
+        if (unlikely(key == NULL))
+                return;
  
          spin_lock(&lli->lli_lock);
-        if (lli->lli_opendir_pid == 0 ||
-            unlikely(lli->lli_opendir_key != key)) {
+        if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
                  spin_unlock(&lli->lli_lock);
                  return;
          }
  
          lli->lli_opendir_key = NULL;
-        lli->lli_opendir_pid = 0;
  
          if (lli->lli_sai) {
                  struct l_wait_info lwi = { 0 };
+                struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread;
  
-                thread = &lli->lli_sai->sai_thread;
-                if (!(thread->t_flags & SVC_STOPPED)) {
+                if (!sa_is_stopped(lli->lli_sai)) {
                          thread->t_flags = SVC_STOPPING;
                          spin_unlock(&lli->lli_lock);
                          cfs_waitq_signal(&thread->t_ctl_waitq);
@@ -673,7 +888,7 @@ void ll_stop_statahead(struct inode *inode, void *key)
                          CDEBUG(D_READA, "stopping statahead thread, pid %d\n",
                                 cfs_curproc_pid());
                          l_wait_event(thread->t_ctl_waitq,
-                                     thread->t_flags & SVC_STOPPED,
+                                     sa_is_stopped(lli->lli_sai),
                                       &lwi);
                  } else {
                          spin_unlock(&lli->lli_lock);
@@ -685,15 +900,25 @@ void ll_stop_statahead(struct inode *inode, void *key)
                   * maybe inflight.
                   */
                  ll_sai_put(lli->lli_sai);
-                return;
+        } else {
+                lli->lli_opendir_pid = 0;
+                spin_unlock(&lli->lli_lock);
          }
-        spin_unlock(&lli->lli_lock);
  }
  
  enum {
-        LS_NONE_FIRST_DE = 0,   /* not first dirent, or is "." */
-        LS_FIRST_DE,            /* the first non-hidden dirent */
-        LS_FIRST_DOT_DE         /* the first hidden dirent, that is ".xxx" */
+        /*
+         * not first dirent, or is "."
+         */
+        LS_NONE_FIRST_DE = 0,
+        /*
+         * the first non-hidden dirent
+         */
+        LS_FIRST_DE,
+        /*
+         * the first hidden dirent, that is ".xxx
+         */
+        LS_FIRST_DOT_DE
  };
  
  static int is_first_dirent(struct inode *dir, struct dentry *dentry)
@@ -708,7 +933,9 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry)
  
          while (1) {
                  npages = dir_pages(dir);
-                /* reach the end of dir */
+                /*
+                 * reach the end of dir.
+                 */
                  if (index >= npages) {
                          CDEBUG(D_READA, "reach end, index/npages %lu/%lu\n",
                                 index, npages);
@@ -729,16 +956,20 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry)
                  if (!index) {
                          if (unlikely(!(de->lde_name_len == 1 &&
                                         strncmp(de->lde_name, ".", 1) == 0)))
-                                CWARN("Maybe got bad on-disk dir: %lu\n",
-                                      dir->i_ino);
-                        /* skip "." or ingore bad entry */
+                                CWARN("Maybe got bad on-disk dir: %lu/%u\n",
+                                      dir->i_ino, dir->i_generation);
+                        /*
+                         * skip "." or ingore bad entry.
+                         */
                          de = ll_dir_next_entry(de);
  
                          if (unlikely(!(de->lde_name_len == 2 &&
                                         strncmp(de->lde_name, "..", 2) == 0)))
-                                CWARN("Maybe got bad on-disk dir: %lu\n",
-                                      dir->i_ino);
-                        /* skip ".." or ingore bad entry */
+                                CWARN("Maybe got bad on-disk dir: %lu/%u\n",
+                                      dir->i_ino, dir->i_generation);
+                        /*
+                         * skip ".." or ingore bad entry.
+                         */
                          de = ll_dir_next_entry(de);
                  }
  
@@ -772,14 +1003,15 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry)
          RETURN(rc);
  }
  
-/* Start statahead thread if this is the first dir entry.
+/**
+ * Start statahead thread if this is the first dir entry.
   * Otherwise if a thread is started already, wait it until it is ahead of me.
- * Return value: 
- *  0       -- miss
- *  1       -- hit
- *  -EEXIST -- stat ahead thread started, and this is the first dentry
- *  -EBADFD -- statahead thread exit and not dentry available
- *  others  -- error
+ * \retval 0       -- stat ahead thread process such dentry, for lookup, it miss
+ * \retval 1       -- stat ahead thread process such dentry, for lookup, it hit
+ * \retval -EEXIST -- stat ahead thread started, and this is the first dentry
+ * \retval -EBADFD -- statahead thread exit and not dentry available
+ * \retval -EAGAIN -- try to stat by caller
+ * \retval others  -- error
   */
  int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
  {
@@ -794,8 +1026,8 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
          LASSERT(lli->lli_opendir_pid == cfs_curproc_pid());
  
          if (sai) {
-                if (unlikely(sai->sai_thread.t_flags & SVC_STOPPED &&
-                             list_empty(&sai->sai_entries)))
+                if (unlikely(sa_is_stopped(sai) &&
+                             list_empty(&sai->sai_entries_stated)))
                          RETURN(-EBADFD);
  
                  if ((*dentryp)->d_name.name[0] == '.') {
@@ -829,10 +1061,11 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
                          sbi->ll_sa_cached++;
                  } else {
                          sbi->ll_sa_blocked++;
-                        /* thread started already, avoid double-stat */
+                        /*
+                         * thread started already, avoid double-stat.
+                         */
                          l_wait_event(sai->sai_waitq,
-                                     ll_sai_entry_stated(sai) ||
-                                     sai->sai_thread.t_flags & SVC_STOPPED,
+                                     ll_sai_entry_stated(sai) || sa_is_stopped(sai),
                                       &lwi);
                  }
  
@@ -843,21 +1076,28 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
                                            &(*dentryp)->d_name);
                          if (result) {
                                  LASSERT(result != *dentryp);
-                                dput(*dentryp);
+                                /* BUG 16303: do not drop reference count for
+                                 * "*dentryp", VFS will do that by itself. */
                                  *dentryp = result;
                                  RETURN(1);
                          }
                  }
-                /* do nothing for revalidate */
+                /*
+                 * do nothing for revalidate.
+                 */
                  RETURN(0);
          }
  
-         /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ 
+         /*
+          * I am the "lli_opendir_pid" owner, only me can set "lli_sai".
+          */
          LASSERT(lli->lli_sai == NULL);
  
          rc = is_first_dirent(dir, *dentryp);
          if (rc == LS_NONE_FIRST_DE) {
-                /* It is not "ls -{a}l" operation, no need statahead for it */
+                /*
+                 * It is not "ls -{a}l" operation, no need statahead for it.
+                 */
                  spin_lock(&lli->lli_lock);
                  lli->lli_opendir_key = NULL;
                  lli->lli_opendir_pid = 0;
@@ -868,9 +1108,17 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
          sai = ll_sai_alloc();
          if (sai == NULL)
                  RETURN(-ENOMEM);
-        
-        sai->sai_inode  = igrab(dir);
+
          sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
+        sai->sai_inode = igrab(dir);
+        if (unlikely(sai->sai_inode == NULL)) {
+                CWARN("Do not start stat ahead on dying inode %lu/%u.\n",
+                      dir->i_ino, dir->i_generation);
+                OBD_FREE_PTR(sai);
+                RETURN(-ESTALE);
+        }
+
+        LASSERT(sai->sai_inode == (*dentryp)->d_parent->d_inode);
  
          sta.sta_parent = (*dentryp)->d_parent;
          sta.sta_pid    = cfs_curproc_pid();
@@ -879,14 +1127,15 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
          rc = cfs_kernel_thread(ll_statahead_thread, &sta, 0);
          if (rc < 0) {
                  CERROR("can't start ll_sa thread, rc: %d\n", rc);
+                lli->lli_opendir_key = NULL;
                  sai->sai_thread.t_flags = SVC_STOPPED;
                  ll_sai_put(sai);
                  LASSERT(lli->lli_sai == NULL);
-                RETURN(rc);
+                RETURN(-EAGAIN);
          }
  
-        l_wait_event(sai->sai_thread.t_ctl_waitq, 
-                     sai->sai_thread.t_flags & (SVC_RUNNING | SVC_STOPPED),
+        l_wait_event(sai->sai_thread.t_ctl_waitq,
+                     sa_is_running(sai) || sa_is_stopped(sai),
                       &lwi);
  
          /*
@@ -896,21 +1145,20 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
          RETURN(-EEXIST);
  }
  
-/* update hit/miss count */
+/**
+ * update hit/miss count.
+ */
  void ll_statahead_exit(struct dentry *dentry, int result)
  {
-        struct dentry         *parent = dentry->d_parent;
-        struct ll_inode_info  *lli = ll_i2info(parent->d_inode);
-        struct ll_sb_info     *sbi = ll_i2sbi(parent->d_inode);
-        struct ll_dentry_data *ldd = ll_d2d(dentry);
-
-        if (lli->lli_opendir_pid != cfs_curproc_pid())
-                return;
-
-        if (lli->lli_sai) {
-                struct ll_statahead_info *sai = lli->lli_sai;
+        struct dentry            *parent = dentry->d_parent;
+        struct ll_inode_info     *lli = ll_i2info(parent->d_inode);
+        struct ll_sb_info        *sbi = ll_i2sbi(parent->d_inode);
+        struct ll_statahead_info *sai = lli->lli_sai;
+        struct ll_dentry_data    *ldd = ll_d2d(dentry);
+        ENTRY;
  
-                if (result == 1) {
+        if (lli->lli_opendir_pid == cfs_curproc_pid() && sai) {
+                if (result >= 1) {
                          sbi->ll_sa_hit++;
                          sai->sai_hit++;
                          sai->sai_consecutive_miss = 0;
@@ -919,8 +1167,7 @@ void ll_statahead_exit(struct dentry *dentry, int result)
                          sbi->ll_sa_miss++;
                          sai->sai_miss++;
                          sai->sai_consecutive_miss++;
-                        if (sa_low_hit(sai) &&
-                            sai->sai_thread.t_flags & SVC_RUNNING) {
+                        if (sa_low_hit(sai) && sa_is_running(sai)) {
                                  sbi->ll_sa_wrong++;
                                  CDEBUG(D_READA, "statahead for dir %.*s hit "
                                         "ratio too low: hit/miss %u/%u, "
@@ -931,16 +1178,17 @@ void ll_statahead_exit(struct dentry *dentry, int result)
                                         sai->sai_sent, sai->sai_replied,
                                         cfs_curproc_pid());
                                  spin_lock(&lli->lli_lock);
-                                if (!(sai->sai_thread.t_flags & SVC_STOPPED))
+                                if (!sa_is_stopped(sai))
                                          sai->sai_thread.t_flags = SVC_STOPPING;
                                  spin_unlock(&lli->lli_lock);
                          }
                  }
  
-                cfs_waitq_signal(&sai->sai_thread.t_ctl_waitq);
-                ll_sai_entry_put(sai);
-
+                if (!sa_is_stopped(sai))
+                        cfs_waitq_signal(&sai->sai_thread.t_ctl_waitq);
+                ll_sai_entry_fini(sai);
                  if (likely(ldd != NULL))
                          ldd->lld_sa_generation = sai->sai_generation;
          }
+        EXIT;
  }
diff --git a/lustre/llite/super.c b/lustre/llite/super.c

index 3bb9358..18236ea 100644 (file)
--- a/lustre/llite/super.c
+++ b/lustre/llite/super.c
@@ -1,24 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Client Super operations
+ * GPL HEADER START
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -53,6 +66,7 @@ struct super_operations lustre_super_operations =
          .fh_to_dentry   = ll_fh_to_dentry,
          .dentry_to_fh   = ll_dentry_to_fh,
          .remount_fs     = ll_remount_fs,
+        .show_options   = ll_show_options,
  };
  
  
@@ -65,7 +79,7 @@ static int __init init_lustre_lite(void)
          lnet_process_id_t lnet_id;
  
          printk(KERN_INFO "Lustre: Lustre Client File System; "
-               "info@clusterfs.com\n");
+               "http://www.lustre.org/\n");
          ll_file_data_slab = cfs_mem_cache_create("ll_file_data",
                                                   sizeof(struct ll_file_data), 0,
                                                   SLAB_HWCACHE_ALIGN);
@@ -119,7 +133,7 @@ static void __exit exit_lustre_lite(void)
                  lprocfs_remove(&proc_lustre_fs_root);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Lite Client File System");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c

index d7fc835..2078b04 100644 (file)
--- a/lustre/llite/super25.c
+++ b/lustre/llite/super25.c
@@ -1,24 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light Super operations
+ * GPL HEADER START
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_LLITE
@@ -85,6 +98,7 @@ struct super_operations lustre_super_operations =
          .statfs        = ll_statfs,
          .umount_begin  = ll_umount_begin,
          .remount_fs    = ll_remount_fs,
+        .show_options  = ll_show_options,
  };
  
  
@@ -97,7 +111,7 @@ static int __init init_lustre_lite(void)
          lnet_process_id_t lnet_id;
  
          printk(KERN_INFO "Lustre: Lustre Client File System; "
-               "info@clusterfs.com\n");
+               "http://www.lustre.org/\n");
          rc = ll_init_inodecache();
          if (rc)
                  return -ENOMEM;
@@ -162,7 +176,7 @@ static void __exit exit_lustre_lite(void)
                  lprocfs_remove(&proc_lustre_fs_root);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Lite Client File System");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c

index 21de06b..ec6fcc4 100644 (file)
--- a/lustre/llite/symlink.c
+++ b/lustre/llite/symlink.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <linux/fs.h>
@@ -135,7 +150,7 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry, struct n
  #ifdef HAVE_VFS_INTENT_PATCHES
          struct lookup_intent *it = ll_nd2it(nd);
  #endif
-        struct ptlrpc_request *request;
+        struct ptlrpc_request *request = NULL;
          int rc;
          char *symname;
          ENTRY;
@@ -152,9 +167,15 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry, struct n
  #endif
  
          CDEBUG(D_VFSTRACE, "VFS Op\n");
-        down(&lli->lli_size_sem);
-        rc = ll_readlink_internal(inode, &request, &symname);
-        up(&lli->lli_size_sem);
+        /* Limit the recursive symlink depth to 5 instead of default
+         * 8 links when kernel has 4k stack to prevent stack overflow. */
+        if (THREAD_SIZE < 8192 && current->link_count >= 5) {
+                rc = -ELOOP;
+        } else {
+                down(&lli->lli_size_sem);
+                rc = ll_readlink_internal(inode, &request, &symname);
+                up(&lli->lli_size_sem);
+        }
          if (rc) {
                  path_release(nd); /* Kernel assumes that ->follow_link()
                                       releases nameidata on error */
@@ -206,11 +227,7 @@ struct inode_operations ll_fast_symlink_inode_operations = {
  #ifdef HAVE_COOKIE_FOLLOW_LINK
          .put_link       = ll_put_link,
  #endif
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        .revalidate_it  = ll_inode_revalidate_it,
-#else 
          .getattr        = ll_getattr,
-#endif
          .permission     = ll_inode_permission,
          .setxattr       = ll_setxattr,
          .getxattr       = ll_getxattr,
diff --git a/lustre/llite/xattr.c b/lustre/llite/xattr.c

index 2565fed..359f3d6 100644 (file)
--- a/lustre/llite/xattr.c
+++ b/lustre/llite/xattr.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2004 - 2005 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <linux/fs.h>
@@ -97,7 +112,7 @@ int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
  
          if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
                  return -EOPNOTSUPP;
-        if (xattr_type == XATTR_TRUSTED_T && !capable(CAP_SYS_ADMIN))
+        if (xattr_type == XATTR_TRUSTED_T && !cfs_capable(CFS_CAP_SYS_ADMIN))
                  return -EPERM;
          if (xattr_type == XATTR_OTHER_T)
                  return -EOPNOTSUPP;
@@ -133,7 +148,7 @@ int ll_setxattr_common(struct inode *inode, const char *name,
          if (rc) {
                  if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
                          LCONSOLE_INFO("Disabling user_xattr feature because "
-                                      "it is not supported on the server\n"); 
+                                      "it is not supported on the server\n");
                          sbi->ll_flags &= ~LL_SBI_USER_XATTR;
                  }
                  RETURN(rc);
@@ -156,10 +171,10 @@ int ll_setxattr(struct dentry *dentry, const char *name,
  
          ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1);
  
-        if ((strncmp(name, XATTR_TRUSTED_PREFIX, 
+        if ((strncmp(name, XATTR_TRUSTED_PREFIX,
                      sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
               strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
-            (strncmp(name, XATTR_LUSTRE_PREFIX, 
+            (strncmp(name, XATTR_LUSTRE_PREFIX,
                      sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
               strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
                  struct lov_user_md *lump = (struct lov_user_md *)value;
@@ -168,18 +183,21 @@ int ll_setxattr(struct dentry *dentry, const char *name,
                  if (S_ISREG(inode->i_mode)) {
                          struct file f;
                          int flags = FMODE_WRITE;
-                        
+
                          f.f_dentry = dentry;
-                        rc = ll_lov_setstripe_ea_info(inode, &f, flags, 
+                        rc = ll_lov_setstripe_ea_info(inode, &f, flags,
                                                        lump, sizeof(*lump));
                          /* b10667: rc always be 0 here for now */
                          rc = 0;
                  } else if (S_ISDIR(inode->i_mode)) {
                          rc = ll_dir_setstripe(inode, lump, 0);
                  }
-                
+
                  return rc;
-        }
+        } else if (strcmp(name, "trusted.lma") == 0 &&
+                   !OBD_FAIL_CHECK(OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING))
+                return 0;
+
  
          return ll_setxattr_common(inode, name, value, size, flags,
                                    OBD_MD_FLXATTR);
@@ -261,7 +279,7 @@ do_getxattr:
          if (rc) {
                  if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
                          LCONSOLE_INFO("Disabling user_xattr feature because "
-                                      "it is not supported on the server\n"); 
+                                      "it is not supported on the server\n");
                          sbi->ll_flags &= ~LL_SBI_USER_XATTR;
                  }
                  RETURN(rc);
@@ -318,23 +336,25 @@ ssize_t ll_getxattr(struct dentry *dentry, const char *name,
  
          ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1);
  
-        if ((strncmp(name, XATTR_TRUSTED_PREFIX, 
+        if ((strncmp(name, XATTR_TRUSTED_PREFIX,
                      sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
               strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
-            (strncmp(name, XATTR_LUSTRE_PREFIX, 
+            (strncmp(name, XATTR_LUSTRE_PREFIX,
                      sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
               strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
                  struct lov_user_md *lump;
                  struct lov_mds_md *lmm = NULL;
                  struct ptlrpc_request *request = NULL;
-                int rc = 0, lmmsize;
+                int rc = 0, lmmsize = 0;
  
                  if (S_ISREG(inode->i_mode)) {
-                        rc = ll_lov_getstripe_ea_info(dentry->d_parent->d_inode, 
-                                                      dentry->d_name.name, &lmm, 
+                        rc = ll_lov_getstripe_ea_info(dentry->d_parent->d_inode,
+                                                      dentry->d_name.name, &lmm,
                                                        &lmmsize, &request);
                  } else if (S_ISDIR(inode->i_mode)) {
                          rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+                } else {
+                        rc = -ENODATA;
                  }
  
                  if (rc < 0)
@@ -343,8 +363,8 @@ ssize_t ll_getxattr(struct dentry *dentry, const char *name,
                         GOTO(out, rc = lmmsize);
  
                  if (size < lmmsize) {
-                        CERROR("server bug: replied size %u > %u\n",
-                               lmmsize, (int)size);
+                        CERROR("server bug: replied size %d > %d for %s (%s)\n",
+                               lmmsize, (int)size, dentry->d_name.name, name);
                          GOTO(out, rc = -ERANGE);
                  }
  
@@ -382,7 +402,7 @@ ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
                  struct lov_stripe_md *lsm = NULL;
                  lsm = lli->lli_smd;
                  if (lsm == NULL)
-                        rc2 = -1; 
+                        rc2 = -1;
          } else if (S_ISDIR(inode->i_mode)) {
                  rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
          }
@@ -405,7 +425,6 @@ ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
  out:
          ptlrpc_req_finished(request);
          rc = rc + rc2;
-        
+
          return rc;
  }
-
diff --git a/lustre/lov/Makefile.in b/lustre/lov/Makefile.in

index f714192..0f223f8 100644 (file)
--- a/lustre/lov/Makefile.in
+++ b/lustre/lov/Makefile.in
@@ -1,4 +1,4 @@
  MODULES := lov
-lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o
+lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_pool.o
  
  @INCLUDE_RULES@
diff --git a/lustre/lov/autoMakefile.am b/lustre/lov/autoMakefile.am

index 583a425..763690a 100644 (file)
--- a/lustre/lov/autoMakefile.am
+++ b/lustre/lov/autoMakefile.am
@@ -1,11 +1,42 @@
-# Copyright (C) 2002 Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if LIBLUSTRE
  noinst_LIBRARIES = liblov.a
-liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h
+liblov_a_SOURCES = lov_log.c lov_pool.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h
  liblov_a_CPPFLAGS = $(LLCPPFLAGS)
  liblov_a_CFLAGS = $(LLCFLAGS)
  endif
@@ -20,6 +51,7 @@ macos_PROGRAMS := lov
  
  lov_SOURCES :=          \
          lov_log.c       \
+       lov_pool.c      \
          lov_obd.c       \
          lov_pack.c      \
          lov_request.c   \
diff --git a/lustre/lov/lov_ea.c b/lustre/lov/lov_ea.c

index d5d60d6..05056b3 100755 (executable)
--- a/lustre/lov/lov_ea.c
+++ b/lustre/lov/lov_ea.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2001-2005 Cluster File Systems, Inc.
- *   Author: Wang Di <wangdi@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_ea.c
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -53,29 +68,27 @@ static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
  
          if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
                  CERROR("bad stripe count %d\n", stripe_count);
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                  return -EINVAL;
          }
  
          if (lmm->lmm_object_id == 0) {
                  CERROR("zero object id\n");
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                  return -EINVAL;
          }
  
          if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
                  CERROR("bad striping pattern\n");
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                  return -EINVAL;
          }
  
          if (lmm->lmm_stripe_size == 0 ||
-            (stripe_count != -1 &&
-             (__u64)le32_to_cpu(lmm->lmm_stripe_size)*stripe_count >
-             0xffffffff)) {
+             (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) {
                  CERROR("bad stripe size %u\n",
                         le32_to_cpu(lmm->lmm_stripe_size));
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                  return -EINVAL;
          }
          return 0;
@@ -103,6 +116,7 @@ struct lov_stripe_md *lsm_alloc_plain(int stripe_count, int *size)
                  lsm->lsm_oinfo[i] = loi;
          }
          lsm->lsm_stripe_count = stripe_count;
+        lsm->lsm_pool_name[0] = '\0';
          return lsm;
  
  err:
@@ -127,26 +141,31 @@ void lsm_free_plain(struct lov_stripe_md *lsm)
  static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
                                  struct lov_mds_md *lmm)
  {
+        /*
+         * This supposes lov_mds_md_v1/v3 first fields are
+         * are the same
+         */
          lsm->lsm_object_id = le64_to_cpu(lmm->lmm_object_id);
          lsm->lsm_object_gr = le64_to_cpu(lmm->lmm_object_gr);
          lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
          lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+        lsm->lsm_pool_name[0] = '\0';
  }
  
  static void
  lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
  {
          if (swidth)
-                *swidth = (ulong)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
  }
  
  static void
  lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
  {
          if (swidth)
-                *swidth = (ulong)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
  }
  
  static obd_off
@@ -182,20 +201,20 @@ static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
          return 0;
  }
  
-static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes,
+static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
                               int *stripe_count)
  {
          if (lmm_bytes < sizeof(*lmm)) {
-                CERROR("lov_mds_md too small: %d, need at least %d\n",
+                CERROR("lov_mds_md_v1 too small: %d, need at least %d\n",
                         lmm_bytes, (int)sizeof(*lmm));
                  return -EINVAL;
          }
  
          *stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
  
-        if (lmm_bytes < lov_mds_md_v1_size(*stripe_count)) {
-                CERROR("LOV EA too small: %d, need %d\n",
-                       lmm_bytes, lov_mds_md_v1_size(*stripe_count));
+        if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) {
+                CERROR("LOV EA V1 too small: %d, need %d\n",
+                       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1));
                  lov_dump_lmm_v1(D_WARNING, lmm);
                  return -EINVAL;
          }
@@ -203,7 +222,7 @@ static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes,
          return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
  }
  
-int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm,
+int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
                      struct lov_mds_md_v1 *lmm)
  {
          struct lov_oinfo *loi;
@@ -234,7 +253,7 @@ int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm,
          return 0;
  }
  
-struct lsm_operations lsm_plain_ops = {
+struct lsm_operations lsm_v1_ops = {
          .lsm_free            = lsm_free_plain,
          .lsm_destroy         = lsm_destroy_plain,
          .lsm_stripe_by_index    = lsm_stripe_by_index_plain,
@@ -243,8 +262,8 @@ struct lsm_operations lsm_plain_ops = {
          .lsm_stripe_offset_by_index  = lsm_stripe_offset_by_index_plain,
          .lsm_stripe_offset_by_offset = lsm_stripe_offset_by_offset_plain,
          .lsm_stripe_index_by_offset  = lsm_stripe_index_by_offset_plain,
-        .lsm_lmm_verify         = lsm_lmm_verify_plain,
-        .lsm_unpackmd           = lsm_unpackmd_plain,
+        .lsm_lmm_verify         = lsm_lmm_verify_v1,
+        .lsm_unpackmd           = lsm_unpackmd_v1,
  };
  
  struct lov_extent *lovea_off2le(struct lov_stripe_md *lsm, obd_off lov_off)
@@ -316,7 +335,7 @@ static void lsm_free_join(struct lov_stripe_md *lsm)
  
  static void
  lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
  {
          struct lov_extent *le;
  
@@ -329,7 +348,7 @@ lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
          *stripeno -= le->le_loi_idx;
  
          if (swidth)
-                *swidth = (ulong)lsm->lsm_stripe_size * le->le_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * le->le_stripe_count;
  
          if (lov_off) {
                  struct lov_extent *lov_le = lovea_off2le(lsm, *lov_off);
@@ -346,7 +365,7 @@ lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
  
  static void
  lsm_stripe_by_offset_join(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
  {
          struct lov_extent *le;
  
@@ -362,7 +381,7 @@ lsm_stripe_by_offset_join(struct lov_stripe_md *lsm, int *stripeno,
                  *stripeno -= le->le_loi_idx;
  
          if (swidth)
-                *swidth = (ulong)lsm->lsm_stripe_size * le->le_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * le->le_stripe_count;
  }
  
  static obd_off
@@ -429,7 +448,8 @@ static int lovea_unpack_array(struct llog_handle *handle,
          /* insert extent desc into lsm extent array  */
          lai->lai_ext_array[cursor].le_start = le64_to_cpu(med->med_start);
          lai->lai_ext_array[cursor].le_len   = le64_to_cpu(med->med_len);
-        lai->lai_ext_array[cursor].le_stripe_count = lmm->lmm_stripe_count;
+        lai->lai_ext_array[cursor].le_stripe_count =
+                                   le32_to_cpu(lmm->lmm_stripe_count);
  
          /* unpack extent's lmm to lov_oinfo array */
          loi_index = lai->lai_ext_array[cursor].le_loi_idx;
@@ -569,8 +589,10 @@ static int lovea_init_array_info(struct lov_stripe_md *lsm,
          if (!lai)
                  RETURN(-ENOMEM);
  
-        lai->lai_array_id = *logid;
-        lai->lai_ext_count = extent_count;
+        lai->lai_array_id.lgl_oid = le64_to_cpu(logid->lgl_oid);
+        lai->lai_array_id.lgl_ogr = le64_to_cpu(logid->lgl_ogr);
+        lai->lai_array_id.lgl_ogen = le32_to_cpu(logid->lgl_ogen);
+        lai->lai_ext_count = le32_to_cpu(extent_count);
          lsm->lsm_array = lai;
          RETURN(0);
  }
@@ -609,3 +631,77 @@ struct lsm_operations lsm_join_ops = {
  };
  
  
+static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes,
+                             int *stripe_count)
+{
+        struct lov_mds_md_v3 *lmm;
+
+        lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+        if (lmm_bytes < sizeof(*lmm)) {
+                CERROR("lov_mds_md_v3 too small: %d, need at least %d\n",
+                       lmm_bytes, (int)sizeof(*lmm));
+                return -EINVAL;
+        }
+
+        *stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+
+        if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) {
+                CERROR("LOV EA V3 too small: %d, need %d\n",
+                       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3));
+                lov_dump_lmm_v3(D_WARNING, lmm);
+                return -EINVAL;
+        }
+
+        return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes,
+                                     *stripe_count);
+}
+
+int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                    struct lov_mds_md *lmmv1)
+{
+        struct lov_mds_md_v3 *lmm;
+        struct lov_oinfo *loi;
+        int i;
+
+        lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+        lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm);
+        strncpy(lsm->lsm_pool_name, lmm->lmm_pool_name, LOV_MAXPOOLNAME);
+
+        for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                /* XXX LOV STACKING call down to osc_unpackmd() */
+                loi = lsm->lsm_oinfo[i];
+                loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id);
+                loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr);
+                loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+                loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+                if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+                        CERROR("OST index %d more than OST count %d\n",
+                               loi->loi_ost_idx, lov->desc.ld_tgt_count);
+                        lov_dump_lmm_v3(D_WARNING, lmm);
+                        return -EINVAL;
+                }
+                if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                        CERROR("OST index %d missing\n", loi->loi_ost_idx);
+                        lov_dump_lmm_v3(D_WARNING, lmm);
+                        return -EINVAL;
+                }
+        }
+
+        return 0;
+}
+
+struct lsm_operations lsm_v3_ops = {
+        .lsm_free            = lsm_free_plain,
+        .lsm_destroy         = lsm_destroy_plain,
+        .lsm_stripe_by_index    = lsm_stripe_by_index_plain,
+        .lsm_stripe_by_offset   = lsm_stripe_by_offset_plain,
+        .lsm_revalidate         = lsm_revalidate_plain,
+        .lsm_stripe_offset_by_index  = lsm_stripe_offset_by_index_plain,
+        .lsm_stripe_offset_by_offset = lsm_stripe_offset_by_offset_plain,
+        .lsm_stripe_index_by_offset  = lsm_stripe_index_by_offset_plain,
+        .lsm_lmm_verify         = lsm_lmm_verify_v3,
+        .lsm_unpackmd           = lsm_unpackmd_v3,
+};
+
diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h

index 9465768..4f4d72f 100644 (file)
--- a/lustre/lov/lov_internal.h
+++ b/lustre/lov/lov_internal.h
@@ -1,10 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- * This code is issued under the GNU General Public License.
- * See the file COPYING in this distribution
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef LOV_INTERNAL_H
@@ -130,6 +157,8 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
                    struct ost_lvb *lvb, int kms_only);
  int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
                     obd_off size, int shrink);
+int lov_update_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
+                   struct ost_lvb *lvb, obd_flag valid);
  
  /* lov_offset.c */
  obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
@@ -192,9 +221,8 @@ int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
                         struct lov_request_set **reqset);
  int lov_fini_punch_set(struct lov_request_set *set);
  int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info,
-                      struct obdo *src_oa,
-                      struct lov_stripe_md *lsm, obd_off start,
-                      obd_off end, struct lov_request_set **reqset);
+                      obd_off start, obd_off end,
+                      struct lov_request_set **reqset);
  int lov_fini_sync_set(struct lov_request_set *set);
  int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
                           struct ldlm_enqueue_info *einfo,
@@ -216,8 +244,8 @@ int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
  int lov_fini_cancel_set(struct lov_request_set *set);
  int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
                          struct lov_request_set **reqset);
-void lov_update_statfs(struct obd_device *obd, struct obd_statfs *osfs,
-                       struct obd_statfs *lov_sfs, int success);
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+                       int success);
  int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
                      int success);
  int lov_fini_statfs_set(struct lov_request_set *set);
@@ -254,6 +282,9 @@ void lov_free_memmd(struct lov_stripe_md **lsmp);
  
  void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
  void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj);
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm);
+void lov_dump_lmm(int level, void *lmm);
+
  /* lov_ea.c */
  int lov_unpackmd_join(struct lov_obd *lov, struct lov_stripe_md *lsm,
                        struct lov_mds_md *lmm);
@@ -275,4 +306,54 @@ static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
  }
  #endif
  
+/* pools */
+extern lustre_hash_ops_t pool_hash_operations;
+
+/* ost_pool methods */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count);
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int max_count);
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int max_count);
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx);
+int lov_ost_pool_free(struct ost_pool *op);
+
+/* high level pool methods */
+int lov_pool_new(struct obd_device *obd, char *poolname);
+int lov_pool_del(struct obd_device *obd, char *poolname);
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname);
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+void lov_dump_pool(int level, struct pool_desc *pool);
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+
+
+#if BITS_PER_LONG == 64
+# define ll_do_div64(n,base) ({                                 \
+        uint64_t __base = (base);                               \
+        uint64_t __rem;                                         \
+        __rem = ((uint64_t)(n)) % __base;                       \
+        (n) = ((uint64_t)(n)) / __base;                         \
+        __rem;                                                  \
+  })
+#elif BITS_PER_LONG == 32
+# define ll_do_div64(n,base) ({                                 \
+        uint64_t __rem;                                         \
+        if ((sizeof(base) > 4) && (((base)&0xffffffff00000000ULL) != 0)) { \
+                int __remainder;                                \
+                LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov "\
+                          "division %llu / %llu\n", (n), (base)); \
+                __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1);  \
+                (n) >>= LOV_MIN_STRIPE_BITS;                    \
+                (base) >>= LOV_MIN_STRIPE_BITS;                 \
+                __rem = do_div(n, base);                        \
+                __rem <<= LOV_MIN_STRIPE_BITS;                  \
+                __rem += __remainder;                           \
+        } else {                                                \
+                __rem = do_div(n, base);                        \
+        }                                                       \
+        __rem;                                                  \
+  })
+#else
+#error Unsupported architecture.
+#endif
+
  #endif
diff --git a/lustre/lov/lov_log.c b/lustre/lov/lov_log.c

index 6e59ecf..8a5c39e 100644 (file)
--- a/lustre/lov/lov_log.c
+++ b/lustre/lov/lov_log.c
@@ -1,28 +1,43 @@
- /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- * Author: Phil Schwan <phil@clusterfs.com>
- *         Peter Braam <braam@clusterfs.com>
- *         Mike Shaver <shaver@clusterfs.com>
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * lustre/lov/lov_log.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -53,9 +68,9 @@
   * we need to keep cookies in stripe order, even if some are NULL, so that
   * the right cookies are passed back to the right OSTs at the client side.
   * Unset cookies should be all-zero (which will never occur naturally). */
-static int lov_llog_origin_add(struct llog_ctxt *ctxt,
-                        struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
-                        struct llog_cookie *logcookies, int numcookies)
+static int lov_llog_origin_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
+                               struct lov_stripe_md *lsm, 
+                               struct llog_cookie *logcookies, int numcookies)
  {
          struct obd_device *obd = ctxt->loc_obd;
          struct lov_obd *lov = &obd->u.lov;
@@ -77,13 +92,19 @@ static int lov_llog_origin_add(struct llog_ctxt *ctxt,
                  case MDS_UNLINK_REC: {
                          struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
                          lur->lur_oid = loi->loi_id;
-                        lur->lur_ogen = loi->loi_gr;
+                        lur->lur_ogr = loi->loi_gr;
                          break;
                  }
                  case MDS_SETATTR_REC: {
                          struct llog_setattr_rec *lsr = (struct llog_setattr_rec *)rec;
                          lsr->lsr_oid = loi->loi_id;
-                        lsr->lsr_ogen = loi->loi_gr;
+                        lsr->lsr_ogr = loi->loi_gr;
+                        break;
+                }
+                case MDS_SETATTR64_REC: {
+                        struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec;
+                        lsr->lsr_oid = loi->loi_id;
+                        lsr->lsr_ogr = loi->loi_gr;
                          break;
                  }
                  default:
@@ -98,7 +119,7 @@ static int lov_llog_origin_add(struct llog_ctxt *ctxt,
          RETURN(rc);
  }
  
-static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count,
+static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
                                     struct llog_logid *logid,
                                     struct llog_gen *gen,
                                     struct obd_uuid *uuid)
@@ -109,7 +130,7 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count,
          ENTRY;
  
          lov_getref(obd);
-        for (i = 0; i < count; i++) {
+        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                  struct obd_device *child;
                  struct llog_ctxt *cctxt;
                  
@@ -117,10 +138,10 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count,
                          continue;
                  if (uuid && !obd_uuid_equals(uuid, &lov->lov_tgts[i]->ltd_uuid))
                          continue;
-                CDEBUG(D_CONFIG, "connect %d/%d\n", i, count);
+                CDEBUG(D_CONFIG, "connect %d/%d\n", i, lov->desc.ld_tgt_count);
                  child = lov->lov_tgts[i]->ltd_exp->exp_obd;
                  cctxt = llog_get_context(child, ctxt->loc_idx);
-                rc = llog_connect(cctxt, 1, logid, gen, uuid);
+                rc = llog_connect(cctxt, logid, gen, uuid);
                  llog_ctxt_put(cctxt);
   
                  if (rc) {
@@ -187,6 +208,8 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt,
          int i, rc = 0, err = 0;
          ENTRY;
  
+        LASSERT(uuid);
+
          rc = llog_setup(obd, LLOG_MDS_OST_ORIG_CTXT, tgt, 0, NULL,
                          &lov_mds_ost_orig_logops);
          if (rc)
@@ -195,19 +218,18 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt,
          rc = llog_setup(obd, LLOG_SIZE_REPL_CTXT, tgt, 0, NULL,
                          &lov_size_repl_logops);
          if (rc)
-                RETURN(rc);
+                GOTO(err_cleanup, rc);
  
          lov_getref(obd);
-        /* count may not match lov->desc.ld_tgt_count during dynamic ost add */
-        for (i = 0; i < count; i++) {
+        for (i = 0; i < lov->desc.ld_tgt_count ; i++) {
                  if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
                          continue;
-                if (uuid && !obd_uuid_equals(uuid, &lov->lov_tgts[i]->ltd_uuid))
+                if (!obd_uuid_equals(uuid, &lov->lov_tgts[i]->ltd_uuid))
                          continue;
                  CDEBUG(D_CONFIG, "init %d/%d\n", i, count);
                  LASSERT(lov->lov_tgts[i]->ltd_exp);
                  child = lov->lov_tgts[i]->ltd_exp->exp_obd;
-                rc = obd_llog_init(child, tgt, 1, logid + i, uuid);
+                rc = obd_llog_init(child, tgt, 1, logid, uuid);
                  if (rc) {
                          CERROR("error osc_llog_init idx %d osc '%s' tgt '%s' "
                                 "(rc=%d)\n", i, child->obd_name, tgt->obd_name,
@@ -217,7 +239,18 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt,
                  }
          }
          lov_putref(obd);
-        RETURN(err);
+        GOTO(err_cleanup, err);
+err_cleanup:
+        if (err) {
+                struct llog_ctxt *ctxt = 
+                        llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+                if (ctxt)
+                        llog_cleanup(ctxt);
+                ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+                if (ctxt)
+                        llog_cleanup(ctxt);
+        }
+        return err;
  }
  
  int lov_llog_finish(struct obd_device *obd, int count)
diff --git a/lustre/lov/lov_merge.c b/lustre/lov/lov_merge.c

index 45544c9..c30f5f1 100644 (file)
--- a/lustre/lov/lov_merge.c
+++ b/lustre/lov/lov_merge.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -80,19 +92,14 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
                  lov_size = lov_stripe_size(lsm, tmpsize, i);
                  if (lov_size > size)
                          size = lov_size;
-                /* merge blocks, mtime, atime */
+                /* merge blocks, mtime, atime, ctime */
                  blocks += loi->loi_lvb.lvb_blocks;
+                if (loi->loi_lvb.lvb_mtime > current_mtime)
+                        current_mtime = loi->loi_lvb.lvb_mtime;
                  if (loi->loi_lvb.lvb_atime > current_atime)
                          current_atime = loi->loi_lvb.lvb_atime;
-
-                /* mtime is always updated with ctime, but can be set in past.
-                   As write and utime(2) may happen within 1 second, and utime's
-                   mtime has a priority over write's one, leave mtime from mds 
-                   for the same ctimes. */
-                if (loi->loi_lvb.lvb_ctime > current_ctime) {
+                if (loi->loi_lvb.lvb_ctime > current_ctime)
                          current_ctime = loi->loi_lvb.lvb_ctime;
-                        current_mtime = loi->loi_lvb.lvb_mtime;
-                }
          }
  
          lvb->lvb_size = size;
@@ -165,8 +172,6 @@ void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid,
                          tgt->o_blksize += src->o_blksize;
                  if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
                          tgt->o_ctime = src->o_ctime;
-                /* Only mtime from OSTs are merged here, as they cannot be set
-                   in past (only MDS's mtime can) do not look at ctime. */
                  if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
                          tgt->o_mtime = src->o_mtime;
          } else {
@@ -177,3 +182,24 @@ void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid,
                  *set = 1;
          }
  }
+
+int lov_update_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
+                   struct ost_lvb *lvb, obd_flag valid)
+{
+        int i;
+        struct lov_oinfo *loi;
+
+        LASSERT_SPIN_LOCKED(&lsm->lsm_lock);
+        LASSERT(lsm->lsm_lock_owner == cfs_current());
+
+        for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                loi = lsm->lsm_oinfo[i];
+                if (valid & OBD_MD_FLATIME)
+                        loi->loi_lvb.lvb_atime = lvb->lvb_atime;
+                if (valid & OBD_MD_FLMTIME)
+                        loi->loi_lvb.lvb_mtime = lvb->lvb_mtime;
+                if (valid & OBD_MD_FLCTIME)
+                        loi->loi_lvb.lvb_ctime = lvb->lvb_ctime;
+        }
+        return 0;
+}
diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c

index 222de1d..d6ec791 100644 (file)
--- a/lustre/lov/lov_obd.c
+++ b/lustre/lov/lov_obd.c
@@ -1,29 +1,44 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002-2006 Cluster File Systems, Inc.
- * Author: Phil Schwan <phil@clusterfs.com>
- *         Peter Braam <braam@clusterfs.com>
- *         Mike Shaver <shaver@clusterfs.com>
- *         Nathan Rutman <nathan@clusterfs.com>
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * lustre/lov/lov_obd.c
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -49,6 +64,7 @@
  #include <lprocfs_status.h>
  #include <lustre_param.h>
  #include <lustre_cache.h>
+#include <lustre/ll_fiemap.h>
  
  #include "lov_internal.h"
  
@@ -176,6 +192,76 @@ static int lov_unregister_lock_cancel_cb(struct obd_export *exp,
          return rc;
  }
  
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+                              int activate);
+
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+                      enum obd_notify_event ev, void *data)
+{
+        int rc = 0;
+        ENTRY;
+
+        if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+                struct obd_uuid *uuid;
+
+                LASSERT(watched);
+
+                if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+                        CERROR("unexpected notification of %s %s!\n",
+                               watched->obd_type->typ_name,
+                               watched->obd_name);
+                        RETURN(-EINVAL);
+                }
+                uuid = &watched->u.cli.cl_target_uuid;
+
+                /* Set OSC as active before notifying the observer, so the
+                 * observer can use the OSC normally.
+                 */
+                rc = lov_set_osc_active(obd, uuid, ev == OBD_NOTIFY_ACTIVE);
+                if (rc < 0) {
+                        CERROR("%sactivation of %s failed: %d\n",
+                               (ev == OBD_NOTIFY_ACTIVE) ? "" : "de",
+                               obd_uuid2str(uuid), rc);
+                        RETURN(rc);
+                }
+                /* active event should be pass lov target index as data */
+                data = &rc;
+        }
+
+        /* Pass the notification up the chain. */
+        if (watched) {
+                rc = obd_notify_observer(obd, watched, ev, data);
+        } else {
+                /* NULL watched means all osc's in the lov (only for syncs) */
+                /* sync event should be send lov idx as data */
+                struct lov_obd *lov = &obd->u.lov;
+                struct obd_device *tgt_obd;
+                int i;
+                lov_getref(obd);
+                for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                        if (!lov->lov_tgts[i])
+                                continue;
+                        tgt_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
+
+                        if ((ev == OBD_NOTIFY_SYNC) ||
+                            (ev == OBD_NOTIFY_SYNC_NONBLOCK))
+                                data = &i;
+
+                        rc = obd_notify_observer(obd, tgt_obd, ev, data);
+                        if (rc) {
+                                CERROR("%s: notify %s of %s failed %d\n",
+                                       obd->obd_name, 
+                                       obd->obd_observer->obd_name,
+                                       tgt_obd->obd_name, rc);
+                                break;
+                        }
+                }
+                lov_putref(obd);
+        }
+
+        RETURN(rc);
+}
+
  #define MAX_STRING_SIZE 128
  static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, 
                             struct obd_connect_data *data)
@@ -340,7 +426,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
          /* Why should there ever be more than 1 connect? */
          lov->lov_connects++;
          LASSERT(lov->lov_connects == 1);
-        
+
          memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd));
          if (data)
                  lov->lov_ocd = *data;
@@ -358,9 +444,19 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                                 obd->obd_name, i, rc);
                          continue;
                  }
+                /* connect to administrative disabled ost */
+                if (!lov->lov_tgts[i]->ltd_exp)
+                        continue;
+
+                rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd,
+                                OBD_NOTIFY_ACTIVE, (void *)&i);
+                if (rc) {
+                        CERROR("%s error sending notify %d\n",
+                               obd->obd_name, rc);
+                }
          }
          lov_putref(obd);
-        
+
          RETURN(0);
  }
  
@@ -467,13 +563,14 @@ out:
   *  -EINVAL  : UUID can't be found in the LOV's target list
   *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
   *  -EBADF   : The UUID is found, but the OBD is the wrong type (!)
+ *  - any above 0 is lov index
   */
  static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
                                int activate)
  {
          struct lov_obd *lov = &obd->u.lov;
          struct lov_tgt_desc *tgt;
-        int i, rc = 0;
+        int i = 0;
          ENTRY;
  
          CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n",
@@ -493,12 +590,12 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
          }
  
          if (i == lov->desc.ld_tgt_count)
-                GOTO(out, rc = -EINVAL);
+                GOTO(out, i = -EINVAL);
  
          if (lov->lov_tgts[i]->ltd_active == activate) {
                  CDEBUG(D_INFO, "OSC %s already %sactive!\n", uuid->uuid,
                         activate ? "" : "in");
-                GOTO(out, rc);
+                GOTO(out, i);
          }
  
          CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", obd_uuid2str(uuid),
@@ -518,67 +615,9 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
  
   out:
          lov_putref(obd);
-        RETURN(rc);
+        RETURN(i);
  }
  
-static int lov_notify(struct obd_device *obd, struct obd_device *watched,
-                      enum obd_notify_event ev, void *data)
-{
-        int rc = 0;
-        ENTRY;
-
-        if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
-                struct obd_uuid *uuid;
-
-                LASSERT(watched);
-                
-                if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
-                        CERROR("unexpected notification of %s %s!\n",
-                               watched->obd_type->typ_name,
-                               watched->obd_name);
-                        RETURN(-EINVAL);
-                }
-                uuid = &watched->u.cli.cl_target_uuid;
-
-                /* Set OSC as active before notifying the observer, so the
-                 * observer can use the OSC normally.
-                 */
-                rc = lov_set_osc_active(obd, uuid, ev == OBD_NOTIFY_ACTIVE);
-                if (rc) {
-                        CERROR("%sactivation of %s failed: %d\n",
-                               (ev == OBD_NOTIFY_ACTIVE) ? "" : "de",
-                               obd_uuid2str(uuid), rc);
-                        RETURN(rc);
-                }
-        }
-
-        /* Pass the notification up the chain. */
-        if (watched) {
-                rc = obd_notify_observer(obd, watched, ev, data);
-        } else {
-                /* NULL watched means all osc's in the lov (only for syncs) */
-                struct lov_obd *lov = &obd->u.lov;
-                struct obd_device *tgt_obd;
-                int i;
-                lov_getref(obd);
-                for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                        if (!lov->lov_tgts[i])
-                                continue;
-                        tgt_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
-                        rc = obd_notify_observer(obd, tgt_obd, ev, data);
-                        if (rc) {
-                                CERROR("%s: notify %s of %s failed %d\n",
-                                       obd->obd_name, 
-                                       obd->obd_observer->obd_name,
-                                       tgt_obd->obd_name, rc);
-                                break;
-                        }
-                }
-                lov_putref(obd);
-        }
-
-        RETURN(rc);
-}
  
  static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                            __u32 index, int gen, int active)
@@ -640,7 +679,6 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                         lov->lov_tgts, lov->lov_tgt_size);
          }
  
-
          OBD_ALLOC_PTR(tgt);
          if (!tgt) {
                  mutex_up(&lov->lov_lock);
@@ -656,6 +694,11 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
          lov->lov_tgts[index] = tgt;
          if (index >= lov->desc.ld_tgt_count)
                  lov->desc.ld_tgt_count = index + 1;
+
+        rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+        if (rc)
+                RETURN(rc);
+
          mutex_up(&lov->lov_lock);
  
          CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
@@ -674,6 +717,10 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
          if (rc)
                  GOTO(out, rc);
  
+        /* connect to administrative disabled ost */
+        if (!tgt->ltd_exp)
+                GOTO(out, rc = 0);
+
          rc = lov_notify(obd, tgt->ltd_exp->exp_obd, 
                          active ? OBD_NOTIFY_ACTIVE : OBD_NOTIFY_INACTIVE,
                          (void *)&index);
@@ -754,8 +801,9 @@ static void __lov_del_obd(struct obd_device *obd, __u32 index)
           * maximum tgt index for computing the mds_max_easize. So we can't
           * shrink it. */
  
+        lov_ost_pool_remove(&lov->lov_packed, index);
          lov->lov_tgts[index] = NULL;
-        OBD_FREE_PTR(tgt);        
+        OBD_FREE_PTR(tgt);
  
          /* Manual cleanup - no cleanup logs to clean up the osc's.  We must
             do it ourselves. And we can't do it from lov_cleanup,
@@ -771,8 +819,9 @@ static void __lov_del_obd(struct obd_device *obd, __u32 index)
  void lov_fix_desc_stripe_size(__u64 *val)
  {
          if (*val < PTLRPC_MAX_BRW_SIZE) {
-                LCONSOLE_WARN("Increasing default stripe size to min %u\n",
-                              PTLRPC_MAX_BRW_SIZE);
+                if (*val)
+                        LCONSOLE_WARN("Increasing default stripe size from "
+                                      LPU64" to %u\n",*val,PTLRPC_MAX_BRW_SIZE);
                  *val = PTLRPC_MAX_BRW_SIZE;
          } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) {
                  *val &= ~(LOV_MIN_STRIPE_SIZE - 1);
@@ -818,7 +867,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
          struct lustre_cfg *lcfg = buf;
          struct lov_desc *desc;
          struct lov_obd *lov = &obd->u.lov;
-        int count;
+        int rc;
          ENTRY;
  
          if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
@@ -848,30 +897,33 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
  
          lov_fix_desc(desc);
  
-        /* Because of 64-bit divide/mod operations only work with a 32-bit
-         * divisor in a 32-bit kernel, we cannot support a stripe width
-         * of 4GB or larger on 32-bit CPUs. */
-        count = desc->ld_default_stripe_count;
-        if ((count > 0 ? count : desc->ld_tgt_count) *
-            desc->ld_default_stripe_size > 0xffffffff) {
-                CERROR("LOV: stripe width "LPU64"x%u > 4294967295 bytes\n",
-                       desc->ld_default_stripe_size, count);
-                RETURN(-EINVAL);
-        }
-
          desc->ld_active_tgt_count = 0;
          lov->desc = *desc;
          lov->lov_tgt_size = 0;
+
          sema_init(&lov->lov_lock, 1);
          atomic_set(&lov->lov_refcount, 0);
          CFS_INIT_LIST_HEAD(&lov->lov_qos.lq_oss_list);
          init_rwsem(&lov->lov_qos.lq_rw_sem);
          lov->lov_qos.lq_dirty = 1;
-        lov->lov_qos.lq_dirty_rr = 1;
+        lov->lov_qos.lq_rr.lqr_dirty = 1;
          lov->lov_qos.lq_reset = 1;
          /* Default priority is toward free space balance */
          lov->lov_qos.lq_prio_free = 232;
  
+        lov->lov_pools_hash_body = lustre_hash_init("POOLS", 7, 7,
+                                                    &pool_hash_operations, 0);
+        CFS_INIT_LIST_HEAD(&lov->lov_pool_list);
+        lov->lov_pool_count = 0;
+        rc = lov_ost_pool_init(&lov->lov_packed, 0);
+        if (rc)
+                RETURN(rc);
+        rc = lov_ost_pool_init(&lov->lov_qos.lq_rr.lqr_pool, 0);
+        if (rc) {
+                lov_ost_pool_free(&lov->lov_packed);
+                RETURN(rc);
+        }
+
          lprocfs_lov_init_vars(&lvars);
          lprocfs_obd_setup(obd, lvars.obd_vars);
  #ifdef LPROCFS
@@ -886,6 +938,9 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
                  }
          }
  #endif
+        lov->lov_pool_proc_entry = lprocfs_register("pools",
+                                                    obd->obd_proc_entry,
+                                                    NULL, NULL);
  
          RETURN(0);
  }
@@ -923,8 +978,20 @@ static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
  static int lov_cleanup(struct obd_device *obd)
  {
          struct lov_obd *lov = &obd->u.lov;
+        struct list_head *pos, *tmp;
+        struct pool_desc *pool;
  
          lprocfs_obd_cleanup(obd);
+
+        list_for_each_safe(pos, tmp, &lov->lov_pool_list) {
+                pool = list_entry(pos, struct pool_desc, pool_list);
+                /* free the pool structs */
+                lov_pool_del(obd, pool->pool_name);
+        }
+        lov_ost_pool_free(&(lov->lov_qos.lq_rr.lqr_pool));
+        lov_ost_pool_free(&lov->lov_packed);
+        lustre_hash_exit(lov->lov_pools_hash_body);
+
          if (lov->lov_tgts) {
                  int i;
                  for (i = 0; i < lov->desc.ld_tgt_count; i++) {
@@ -946,9 +1013,6 @@ static int lov_cleanup(struct obd_device *obd)
                           lov->lov_tgt_size);
                  lov->lov_tgt_size = 0;
          }
-        
-        if (lov->lov_qos.lq_rr_size) 
-                OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size);
  
          RETURN(0);
  }
@@ -998,6 +1062,12 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
                                                lcfg, obd);
                  GOTO(out, rc);
          }
+        case LCFG_POOL_NEW:
+        case LCFG_POOL_ADD:
+        case LCFG_POOL_DEL:
+        case LCFG_POOL_REM:
+                GOTO(out, rc);
+
          default: {
                  CERROR("Unknown command: %d\n", lcfg->lcfg_command);
                  GOTO(out, rc = -EINVAL);
@@ -1067,11 +1137,13 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
                  /* XXX: LOV STACKING: use real "obj_mdp" sub-data */
                  err = obd_create(lov->lov_tgts[i]->ltd_exp, 
                                   tmp_oa, &obj_mdp, oti);
-                if (err)
+                if (err) {
                          /* This export will be disabled until it is recovered,
                             and then orphan recovery will be completed. */
                          CERROR("error in orphan recovery on OST idx %d/%d: "
                                 "rc = %d\n", i, lov->desc.ld_tgt_count, err);
+                        rc = err;
+                }
  
                  if (ost_uuid)
                          break;
@@ -1149,11 +1221,12 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa,
          if (!lov->desc.ld_active_tgt_count)
                  RETURN(-EIO);
  
+        lov_getref(exp->exp_obd);
          /* Recreate a specific object id at the given OST index */
          if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
              (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) {
                   rc = lov_recreate(exp, src_oa, ea, oti);
-                 RETURN(rc);
+                 GOTO(out, rc);
          }
  
          maxage = cfs_time_shift_64(-lov->desc.ld_qos_maxage);
@@ -1161,7 +1234,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa,
  
          rc = lov_prep_create_set(exp, &oinfo, ea, src_oa, oti, &set);
          if (rc)
-                RETURN(rc);
+                GOTO(out, rc);
  
          list_for_each_entry(req, &set->set_list, rq_link) {
                  /* XXX: LOV STACKING: use real "obj_mdp" sub-data */
@@ -1170,13 +1243,16 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa,
                  lov_update_create_set(set, req, rc);
          }
          rc = lov_fini_create_set(set, ea);
+out:
+        lov_putref(exp->exp_obd);
          RETURN(rc);
  }
  
  #define ASSERT_LSM_MAGIC(lsmp)                                                  \
  do {                                                                            \
          LASSERT((lsmp) != NULL);                                                \
-        LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC ||                             \
+        LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 ||                          \
+                 (lsmp)->lsm_magic == LOV_MAGIC_V3 ||                           \
                   (lsmp)->lsm_magic == LOV_MAGIC_JOIN), "%p->lsm_magic=%x\n",    \
                   (lsmp), (lsmp)->lsm_magic);                                    \
  } while (0)
@@ -1190,7 +1266,7 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa,
          struct lov_request *req;
          struct list_head *pos;
          struct lov_obd *lov;
-        int rc = 0, err;
+        int rc = 0, err = 0;
          ENTRY;
  
          ASSERT_LSM_MAGIC(lsm);
@@ -1204,12 +1280,12 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa,
          }
  
          lov = &exp->exp_obd->u.lov;
+        lov_getref(exp->exp_obd);
          rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
          if (rc)
-                RETURN(rc);
+                GOTO(out, rc);
  
          list_for_each (pos, &set->set_list) {
-                int err;
                  req = list_entry(pos, struct lov_request, rq_link);
  
                  if (oa->o_valid & OBD_MD_FLCOOKIE)
@@ -1233,6 +1309,8 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa,
                  rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
          }
          err = lov_fini_destroy_set(set);
+out:
+        lov_putref(exp->exp_obd);
          RETURN(rc ? rc : err);
  }
  
@@ -1444,8 +1522,10 @@ static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
          if (rc)
                  RETURN(rc);
  
-        CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n",
-               oinfo->oi_md->lsm_object_id, oinfo->oi_md->lsm_stripe_count,
+        CDEBUG(D_INFO, "objid "LPX64"@"LPX64": %ux%u byte stripes\n",
+               oinfo->oi_md->lsm_object_id,
+               oinfo->oi_md->lsm_object_gr,
+               oinfo->oi_md->lsm_stripe_count,
                 oinfo->oi_md->lsm_stripe_size);
  
          list_for_each (pos, &set->set_list) {
@@ -1454,9 +1534,9 @@ static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
                  if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
                          oti->oti_logcookies = set->set_cookies + req->rq_stripe;
  
-                CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
-                       "%u\n", oinfo->oi_oa->o_id, req->rq_stripe,
-                       req->rq_oi.oi_oa->o_id, req->rq_idx);
+                CDEBUG(D_INFO, "objid "LPX64"@"LPX64"[%d] has subobj "LPX64
+                       " at idx %u\n", oinfo->oi_oa->o_id, oinfo->oi_oa->o_gr,
+                       req->rq_stripe, req->rq_oi.oi_oa->o_id, req->rq_idx);
  
                  rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
                                         &req->rq_oi, oti, rqset);
@@ -1551,48 +1631,67 @@ static int lov_punch(struct obd_export *exp, struct obd_info *oinfo,
          RETURN(0);
  }
  
-static int lov_sync(struct obd_export *exp, struct obdo *oa,
-                    struct lov_stripe_md *lsm, obd_off start, obd_off end)
+static int lov_sync_interpret(struct ptlrpc_request_set *rqset,
+                              void *data, int rc)
  {
-        struct lov_request_set *set;
-        struct obd_info oinfo;
+        struct lov_request_set *lovset = (struct lov_request_set *)data;
+        int err;
+        ENTRY;
+
+        if (rc)
+                lovset->set_completes = 0;
+        err = lov_fini_sync_set(lovset);
+        RETURN(rc ? rc : err);
+}
+
+static int lov_sync(struct obd_export *exp, struct obd_info *oinfo,
+                    obd_off start, obd_off end,
+                    struct ptlrpc_request_set *rqset)
+{
+        struct lov_request_set *set = NULL;
          struct lov_obd *lov;
          struct list_head *pos;
          struct lov_request *req;
-        int err = 0, rc = 0;
+        int    rc = 0;
          ENTRY;
  
-        ASSERT_LSM_MAGIC(lsm);
+        ASSERT_LSM_MAGIC(oinfo->oi_md);
+        LASSERT(rqset != NULL);
  
          if (!exp->exp_obd)
                  RETURN(-ENODEV);
  
          lov = &exp->exp_obd->u.lov;
-        rc = lov_prep_sync_set(exp, &oinfo, oa, lsm, start, end, &set);
+        rc = lov_prep_sync_set(exp, oinfo, start, end, &set);
          if (rc)
                  RETURN(rc);
  
          list_for_each (pos, &set->set_list) {
                  req = list_entry(pos, struct lov_request, rq_link);
  
-                rc = obd_sync(lov->lov_tgts[req->rq_idx]->ltd_exp, 
-                              req->rq_oi.oi_oa, NULL, 
+                rc = obd_sync(lov->lov_tgts[req->rq_idx]->ltd_exp, &req->rq_oi,
                                req->rq_oi.oi_policy.l_extent.start,
-                              req->rq_oi.oi_policy.l_extent.end);
-                err = lov_update_common_set(set, req, rc);
-                if (err) {
+                              req->rq_oi.oi_policy.l_extent.end, rqset);
+                if (rc) {
                          CERROR("error: fsync objid "LPX64" subobj "LPX64
                                 " on OST idx %d: rc = %d\n",
                                 set->set_oi->oi_oa->o_id,
                                 req->rq_oi.oi_oa->o_id, req->rq_idx, rc);
-                        if (!rc)
-                                rc = err;
+                        break;
                  }
          }
-        err = lov_fini_sync_set(set);
-        if (!rc)
-                rc = err;
-        RETURN(rc);
+
+        /* If we are not waiting for responses on async requests, return. */
+        if (rc || list_empty(&rqset->set_requests)) {
+                int err = lov_fini_sync_set(set);
+                RETURN(rc ? rc : err);
+        }
+
+        LASSERT(rqset->set_interpret == NULL);
+        rqset->set_interpret = lov_sync_interpret;
+        rqset->set_arg = (void *)set;
+
+        RETURN(0);
  }
  
  static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
@@ -1611,7 +1710,7 @@ static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
                  obd_off start, end;
  
                  if (!lov_stripe_intersects(lov_oinfo->oi_md, i, pga[i].off,
-                                           pga[i].off + pga[i].count,
+                                           pga[i].off + pga[i].count - 1,
                                             &start, &end))
                          continue;
  
@@ -2108,6 +2207,7 @@ static int lov_change_cbdata(struct obd_export *exp,
                          continue;
                  }
                  submd.lsm_object_id = loi->loi_id;
+                submd.lsm_object_gr = loi->loi_gr;
                  submd.lsm_stripe_count = 0;
                  rc = obd_change_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
                                         &submd, it, data);
@@ -2454,8 +2554,329 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
          RETURN(rc);
  }
  
+#define FIEMAP_BUFFER_SIZE 4096
+
+/* Non-zero fe_logical indicates that this is a continuation FIEMAP
+ * call. The local end offset and the device are sent in the first
+ * fm_extent. This function calculates the stripe number from the index.
+ * This function returns a stripe_no on which mapping is to be restarted.
+ *
+ * This function returns fm_end_offset which is the in-OST offset at which
+ * mapping should be restarted. If fm_end_offset=0 is returned then caller
+ * will re-calculate proper offset in next stripe.
+ * Note that the first extent is passed to lov_get_info via the value field */
+obd_size fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap,
+                                   struct lov_stripe_md *lsm, obd_size fm_start,
+                                   obd_size fm_end, int *start_stripe)
+{
+        obd_size local_end = fiemap->fm_extents[0].fe_logical;
+        obd_off lun_start, lun_end;
+        obd_size fm_end_offset;
+        int stripe_no = -1, i;
+
+        if (fiemap->fm_extent_count == 0 ||
+            fiemap->fm_extents[0].fe_logical == 0)
+                return 0;
+
+        /* Find out stripe_no from ost_index saved in the fe_device */
+        for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                if (lsm->lsm_oinfo[i]->loi_ost_idx ==
+                                        fiemap->fm_extents[0].fe_device) {
+                        stripe_no = i;
+                        break;
+                }
+        }
+
+        /* If we have finished mapping on previous device, shift logical
+         * offset to start of next device */
+        if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end,
+                                   &lun_start, &lun_end)) != 0 &&
+                                   local_end < lun_end) {
+                fm_end_offset = local_end;
+                *start_stripe = stripe_no;
+        } else {
+                /* This is a special value to indicate that caller should
+                 * calculate offset in next stripe. */
+                fm_end_offset = 0;
+                *start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count;
+        }
+
+        return fm_end_offset;
+}
+
+/* We calculate on which OST the mapping will end. If the length of mapping
+ * is greater than (stripe_size * stripe_count) then the last_stripe will
+ * will be one just before start_stripe. Else we check if the mapping
+ * intersects each OST and find last_stripe.
+ * This function returns the last_stripe and also sets the stripe_count
+ * over which the mapping is spread */
+int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, obd_size fm_start,
+                            obd_size fm_end, int start_stripe,
+                            int *stripe_count)
+{
+        int last_stripe;
+        obd_off obd_start, obd_end;
+        int i, j;
+
+        if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) {
+                last_stripe = (start_stripe < 1 ? lsm->lsm_stripe_count - 1 :
+                                                              start_stripe - 1);
+                *stripe_count = lsm->lsm_stripe_count;
+        } else {
+                for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count;
+                     i = (i + 1) % lsm->lsm_stripe_count, j++) {
+                        if ((lov_stripe_intersects(lsm, i, fm_start, fm_end,
+                                                   &obd_start, &obd_end)) == 0)
+                                break;
+                }
+                *stripe_count = j;
+                last_stripe = (start_stripe + j - 1) %lsm->lsm_stripe_count;
+        }
+
+        return last_stripe;
+}
+
+/* Set fe_device and copy extents from local buffer into main return buffer */
+void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap,
+                                  struct ll_fiemap_extent *lcl_fm_ext,
+                                  int ost_index, unsigned int ext_count,
+                                  int current_extent)
+{
+        char *to;
+        int ext;
+
+        for (ext = 0; ext < ext_count; ext++) {
+                lcl_fm_ext[ext].fe_device = ost_index;
+                lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET;
+        }
+
+        /* Copy fm_extent's from fm_local to return buffer */
+        to = (char *)fiemap + fiemap_count_to_size(current_extent);
+        memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent));
+}
+
+static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key,
+                      __u32 *vallen, void *val, struct lov_stripe_md *lsm)
+{
+        struct ll_fiemap_info_key *fm_key = key;
+        struct ll_user_fiemap *fiemap = val;
+        struct ll_user_fiemap *fm_local = NULL;
+        struct ll_fiemap_extent *lcl_fm_ext;
+        int count_local;
+        unsigned int get_num_extents = 0;
+        int ost_index = 0, actual_start_stripe, start_stripe;
+        obd_size fm_start, fm_end, fm_length, fm_end_offset = 0;
+        obd_size curr_loc;
+        int current_extent = 0, rc = 0, i;
+        int ost_eof = 0; /* EOF for object */
+        int ost_done = 0; /* done with required mapping for this OST? */
+        int last_stripe;
+        int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count;
+        unsigned int buffer_size = FIEMAP_BUFFER_SIZE;
+
+        if (lsm == NULL)
+                GOTO(out, rc = 0);
+
+        if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size)
+                buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count);
+
+        OBD_ALLOC(fm_local, buffer_size);
+        if (fm_local == NULL)
+                GOTO(out, rc = -ENOMEM);
+        lcl_fm_ext = &fm_local->fm_extents[0];
+
+        count_local = fiemap_size_to_count(buffer_size);
+
+        memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap));
+        fm_start = fiemap->fm_start;
+        fm_length = fiemap->fm_length;
+        /* Calculate start stripe, last stripe and length of mapping */
+        actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start);
+        fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size :
+                                                fm_start + fm_length - 1);
+        /* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */
+        if (fm_end > fm_key->oa.o_size)
+                fm_end = fm_key->oa.o_size;
+
+        last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end,
+                                              actual_start_stripe, &stripe_count);
+
+        fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start, fm_end,
+                                                  &start_stripe);
+
+        if (fiemap->fm_extent_count == 0) {
+                get_num_extents = 1;
+                count_local = 0;
+        }
+
+        /* Check each stripe */
+        for (cur_stripe = start_stripe, i = 0; i < stripe_count;
+             i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) {
+                obd_size req_fm_len; /* Stores length of required mapping */
+                obd_size len_mapped_single_call;
+                obd_off lun_start, lun_end, obd_object_end;
+                unsigned int ext_count;
+
+                cur_stripe_wrap = cur_stripe;
+
+                /* Find out range of mapping on this stripe */
+                if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end,
+                                           &lun_start, &obd_object_end)) == 0)
+                        continue;
+
+                /* If this is a continuation FIEMAP call and we are on
+                 * starting stripe then lun_start needs to be set to
+                 * fm_end_offset */
+                if (fm_end_offset != 0 && cur_stripe == start_stripe)
+                        lun_start = fm_end_offset;
+
+                if (fm_length != ~0ULL) {
+                        /* Handle fm_start + fm_length overflow */
+                        if (fm_start + fm_length < fm_start)
+                                fm_length = ~0ULL - fm_start;
+                        lun_end = lov_size_to_stripe(lsm, fm_start + fm_length,
+                                                     cur_stripe);
+                } else {
+                        lun_end = ~0ULL;
+                }
+
+                if (lun_start == lun_end)
+                        continue;
+
+                req_fm_len = obd_object_end - lun_start;
+                fm_local->fm_length = 0;
+                len_mapped_single_call = 0;
+
+                /* If the output buffer is very large and the objects have many
+                 * extents we may need to loop on a single OST repeatedly */
+                ost_eof = 0;
+                ost_done = 0;
+                do {
+                        if (get_num_extents == 0) {
+                                /* Don't get too many extents. */
+                                if (current_extent + count_local >
+                                    fiemap->fm_extent_count)
+                                        count_local = fiemap->fm_extent_count -
+                                                                 current_extent;
+                        }
+
+                        lun_start += len_mapped_single_call;
+                        fm_local->fm_length = req_fm_len - len_mapped_single_call;
+                        req_fm_len = fm_local->fm_length;
+                        fm_local->fm_extent_count = count_local;
+                        fm_local->fm_mapped_extents = 0;
+                        fm_local->fm_flags = fiemap->fm_flags;
+
+                        fm_key->oa.o_id = lsm->lsm_oinfo[cur_stripe]->loi_id;
+                        ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx;
+
+                        if (ost_index < 0 || ost_index >=lov->desc.ld_tgt_count)
+                                GOTO(out, rc = -EINVAL);
+
+                        /* If OST is inactive, return extent with UNKNOWN flag */
+                        if (lov && !lov->lov_tgts[ost_index]->ltd_active) {
+                                fm_local->fm_flags |= FIEMAP_EXTENT_LAST;
+                                fm_local->fm_mapped_extents = 1;
+
+                                lcl_fm_ext[0].fe_logical = lun_start;
+                                lcl_fm_ext[0].fe_length = obd_object_end -
+                                                                      lun_start;
+                                lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
+
+                                goto inactive_tgt;
+                        }
+
+                        fm_local->fm_start = lun_start;
+                        fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
+                        memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local));
+                        *vallen=fiemap_count_to_size(fm_local->fm_extent_count);
+                        rc = obd_get_info(lov->lov_tgts[ost_index]->ltd_exp,
+                                          keylen, key, vallen, fm_local, lsm);
+                        if (rc != 0)
+                                GOTO(out, rc);
+
+inactive_tgt:
+                        ext_count = fm_local->fm_mapped_extents;
+                        if (ext_count == 0) {
+                                ost_done = 1;
+                                /* If last stripe has hole at the end,
+                                 * then we need to return */
+                                if (cur_stripe_wrap == last_stripe) {
+                                        fiemap->fm_mapped_extents = 0;
+                                        goto finish;
+                                }
+                                break;
+                        }
+
+                        /* If we just need num of extents then go to next device */
+                        if (get_num_extents) {
+                                current_extent += ext_count;
+                                break;
+                        }
+
+                        len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical -
+                                  lun_start + lcl_fm_ext[ext_count - 1].fe_length;
+
+                        /* Have we finished mapping on this device? */
+                        if (req_fm_len <= len_mapped_single_call)
+                                ost_done = 1;
+
+                        /* Clear the EXTENT_LAST flag which can be present on
+                         * last extent */
+                        if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST)
+                                lcl_fm_ext[ext_count - 1].fe_flags &=
+                                                            ~FIEMAP_EXTENT_LAST;
+
+                        curr_loc = lov_stripe_size(lsm,
+                                           lcl_fm_ext[ext_count - 1].fe_logical+
+                                           lcl_fm_ext[ext_count - 1].fe_length,
+                                           cur_stripe);
+                        if (curr_loc >= fm_key->oa.o_size)
+                                ost_eof = 1;
+
+                        fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext,
+                                                     ost_index, ext_count,
+                                                     current_extent);
+
+                        current_extent += ext_count;
+
+                        /* Ran out of available extents? */
+                        if (current_extent >= fiemap->fm_extent_count)
+                                goto finish;
+                } while (ost_done == 0 && ost_eof == 0);
+
+                if (cur_stripe_wrap == last_stripe)
+                        goto finish;
+        }
+
+finish:
+        /* Indicate that we are returning device offsets unless file just has
+         * single stripe */
+        if (lsm->lsm_stripe_count > 1)
+                fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER;
+
+        if (get_num_extents)
+                goto skip_last_device_calc;
+
+        /* Check if we have reached the last stripe and whether mapping for that
+         * stripe is done. */
+        if (cur_stripe_wrap == last_stripe) {
+                if (ost_done || ost_eof)
+                        fiemap->fm_extents[current_extent - 1].fe_flags |=
+                                                             FIEMAP_EXTENT_LAST;
+        }
+
+skip_last_device_calc:
+        fiemap->fm_mapped_extents = current_extent;
+
+out:
+        OBD_FREE(fm_local, buffer_size);
+        return rc;
+}
+
  static int lov_get_info(struct obd_export *exp, __u32 keylen,
-                        void *key, __u32 *vallen, void *val)
+                        void *key, __u32 *vallen, void *val,
+                        struct lov_stripe_md *lsm)
  {
          struct obd_device *obddev = class_exp2obd(exp);
          struct lov_obd *lov = &obddev->u.lov;
@@ -2471,7 +2892,6 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
                  struct {
                          char name[16];
                          struct ldlm_lock *lock;
-                        struct lov_stripe_md *lsm;
                  } *data = key;
                  struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
                  struct lov_oinfo *loi;
@@ -2487,20 +2907,19 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
                  /* XXX - it's assumed all the locks for deleted OSTs have
                   * been cancelled. Also, the export for deleted OSTs will
                   * be NULL and won't match the lock's export. */
-                for (i = 0; i < data->lsm->lsm_stripe_count; i++) {
-                        loi = data->lsm->lsm_oinfo[i];
+                for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                        loi = lsm->lsm_oinfo[i];
                          if (!lov->lov_tgts[loi->loi_ost_idx])
                                  continue;
                          if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp ==
                              data->lock->l_conn_export &&
-                            loi->loi_id == res_id->name[0] &&
-                            loi->loi_gr == res_id->name[1]) {
+                            osc_res_name_eq(loi->loi_id, loi->loi_gr, res_id)) {
                                  *stripe = i;
                                  GOTO(out, rc = 0);
                          }
                  }
                  LDLM_ERROR(data->lock, "lock on inode without such object");
-                dump_lsm(D_ERROR, data->lsm);
+                dump_lsm(D_ERROR, lsm);
                  GOTO(out, rc = -ENXIO);
          } else if (KEY_IS(KEY_LAST_ID)) {
                  struct obd_id_info *info = val;
@@ -2513,21 +2932,16 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
                  if (!tgt || !tgt->ltd_active)
                          GOTO(out, rc = -ESRCH);
  
-                rc = obd_get_info(tgt->ltd_exp, keylen, key, &size, info->data);
+                rc = obd_get_info(tgt->ltd_exp, keylen, key, &size, info->data, NULL);
                  GOTO(out, rc = 0);
          } else if (KEY_IS(KEY_LOVDESC)) {
                  struct lov_desc *desc_ret = val;
                  *desc_ret = lov->desc;
  
                  GOTO(out, rc = 0);
-        } else if (KEY_IS(KEY_LOV_IDX)) {
-                struct lov_tgt_desc *tgt;
-
-                for(i = 0; i < lov->desc.ld_tgt_count; i++) {
-                        tgt = lov->lov_tgts[i];
-                        if (tgt && obd_uuid_equals(val, &tgt->ltd_uuid))
-                                GOTO(out, rc = i);
-                }
+        } else if (KEY_IS(KEY_FIEMAP)) {
+                rc = lov_fiemap(lov, keylen, key, vallen, val, lsm);
+                GOTO(out, rc);
          }
  
          rc = -EINVAL;
@@ -2842,6 +3256,7 @@ struct obd_ops lov_obd_ops = {
          .o_trigger_group_io    = lov_trigger_group_io,
          .o_teardown_async_page = lov_teardown_async_page,
          .o_merge_lvb           = lov_merge_lvb,
+        .o_update_lvb          = lov_update_lvb,
          .o_adjust_kms          = lov_adjust_kms,
          .o_punch               = lov_punch,
          .o_sync                = lov_sync,
@@ -2862,6 +3277,10 @@ struct obd_ops lov_obd_ops = {
          .o_unregister_page_removal_cb = lov_unregister_page_removal_cb,
          .o_register_lock_cancel_cb = lov_register_lock_cancel_cb,
          .o_unregister_lock_cancel_cb = lov_unregister_lock_cancel_cb,
+        .o_pool_new            = lov_pool_new,
+        .o_pool_rem            = lov_pool_remove,
+        .o_pool_add            = lov_pool_add,
+        .o_pool_del            = lov_pool_del,
  };
  
  static quota_interface_t *quota_interface;
@@ -2911,7 +3330,7 @@ static void /*__exit*/ lov_exit(void)
          LASSERT(rc == 0);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/lov/lov_offset.c b/lustre/lov/lov_offset.c

index 4717882..487f6da 100644 (file)
--- a/lustre/lov/lov_offset.c
+++ b/lustre/lov/lov_offset.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -42,8 +54,9 @@
  obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
                           int stripeno)
  {
-        unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_size;
+        obd_size ssize  = lsm->lsm_stripe_size;
+        unsigned long stripe_size;
+        obd_off swidth;
          int sindex = stripeno;
          obd_size lov_size;
          int magic = lsm->lsm_magic;
@@ -55,8 +68,8 @@ obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
          LASSERT(lsm_op_find(magic) != NULL);
          lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth);
   
-        /* do_div(a, b) returns a % b, and a = a / b */
-        stripe_size = do_div(ost_size, ssize);
+        /* ll_do_div64(a, b) returns a % b, and a = a / b */
+        stripe_size = ll_do_div64(ost_size, ssize);
          if (stripe_size)
                  lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
          else
@@ -115,42 +128,43 @@ obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
   * falls in the stripe and no shifting was done; > 0 when the offset
   * was outside the stripe and was pulled back to its final byte. */
  int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
-                      int stripeno, obd_off *obd_off)
+                      int stripeno, obd_off *obdoff)
  {
          unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_off, this_stripe;
+        unsigned long stripe_off, this_stripe;
          __u64 l_off, s_off;
+        obd_off swidth;
          int magic = lsm->lsm_magic;
          int ret = 0;
  
          if (lov_off == OBD_OBJECT_EOF) {
-                *obd_off = OBD_OBJECT_EOF;
+                *obdoff = OBD_OBJECT_EOF;
                  return 0;
          }
  
          LASSERT(lsm_op_find(magic) != NULL);
          /*It will check whether the lov_off and stripeno 
           *are in the same extent. 
-         *1) lov_off extent < stripeno extent, ret = -1, obd_off = 0
+         *1) lov_off extent < stripeno extent, ret = -1, obdoff = 0
           *2) lov_off extent > stripeno extent, ret = 1, 
-         *   obd_off = lov_off extent offset*/
+         *   obdoff = lov_off extent offset*/
          l_off = lsm_op_find(magic)->lsm_stripe_offset_by_index(lsm, stripeno);
          s_off = lsm_op_find(magic)->lsm_stripe_offset_by_offset(lsm, lov_off);
          if (s_off < l_off) {
                  ret = -1;
-                *obd_off = 0;
+                *obdoff = 0;
                  return ret;
          } else if (s_off > l_off) {
                  ret = 1;
-                *obd_off = s_off;
+                *obdoff = s_off;
                  return ret;
          }
          /*If they are in the same extent, original logic*/
          lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
                                                  &swidth);
         
-        /* do_div(a, b) returns a % b, and a = a / b */
-        stripe_off = do_div(lov_off, swidth);
+        /* ll_do_div64(a, b) returns a % b, and a = a / b */
+        stripe_off = ll_do_div64(lov_off, swidth);
  
          this_stripe = stripeno * ssize;
          if (stripe_off < this_stripe) {
@@ -165,7 +179,7 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
                  }
          }
  
-        *obd_off = lov_off * ssize + stripe_off;
+        *obdoff = lov_off * ssize + stripe_off;
          return ret;
  }
  
@@ -192,7 +206,8 @@ obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
                             int stripeno)
  {
          unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_off, this_stripe;
+        unsigned long stripe_off, this_stripe;
+        obd_off swidth;
          int magic = lsm->lsm_magic;
  
          if (file_size == OBD_OBJECT_EOF)
@@ -202,8 +217,8 @@ obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
          lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
                                                  &swidth);
  
-        /* do_div(a, b) returns a % b, and a = a / b */
-        stripe_off = do_div(file_size, swidth);
+        /* ll_do_div64(a, b) returns a % b, and a = a / b */
+        stripe_off = ll_do_div64(file_size, swidth);
  
          this_stripe = stripeno * ssize;
          if (stripe_off < this_stripe) {
@@ -265,14 +280,15 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
  int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
  {
          unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_off;
+        unsigned long stripe_off;
+        obd_off swidth;
          obd_off offset = lov_off;
          int magic = lsm->lsm_magic;
  
          LASSERT(lsm_op_find(magic) != NULL);
          lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
  
-        stripe_off = do_div(lov_off, swidth);
+        stripe_off = ll_do_div64(lov_off, swidth);
  
          return (stripe_off/ssize +
                  lsm_op_find(magic)->lsm_stripe_index_by_offset(lsm, offset));
diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c

index aea018f..942ff37 100644 (file)
--- a/lustre/lov/lov_pack.c
+++ b/lustre/lov/lov_pack.c
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pack.c
   *
   * (Un)packing of OST/MDS requests
   *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOV
@@ -68,15 +85,61 @@ void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj)
  {
  
          CDEBUG(level, "objid "LPX64", magic 0x%08X, pattern %#X\n",
-               le64_to_cpu(lmmj->lmmj_md.lmm_object_id), 
+               le64_to_cpu(lmmj->lmmj_md.lmm_object_id),
                 le32_to_cpu(lmmj->lmmj_md.lmm_magic),
                 le32_to_cpu(lmmj->lmmj_md.lmm_pattern));
          CDEBUG(level,"stripe_size %u, stripe_count %u extent_count %u \n",
                 le32_to_cpu(lmmj->lmmj_md.lmm_stripe_size),
-               le32_to_cpu(lmmj->lmmj_md.lmm_stripe_count), 
+               le32_to_cpu(lmmj->lmmj_md.lmm_stripe_count),
                 le32_to_cpu(lmmj->lmmj_extent_count));
  }
  
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
+{
+        struct lov_ost_data_v1 *lod;
+        int i;
+
+        CDEBUG(level, "objid "LPX64", magic 0x%08x, pattern %#x\n",
+               le64_to_cpu(lmm->lmm_object_id), le32_to_cpu(lmm->lmm_magic),
+               le32_to_cpu(lmm->lmm_pattern));
+        CDEBUG(level,"stripe_size %u, stripe_count %u\n",
+               le32_to_cpu(lmm->lmm_stripe_size),
+               le32_to_cpu(lmm->lmm_stripe_count));
+        CDEBUG(level,"pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name);
+
+        if (le32_to_cpu(lmm->lmm_stripe_count) <= LOV_V1_INSANE_STRIPE_COUNT) {
+                for (i = 0, lod = lmm->lmm_objects;
+                     i < (int)le32_to_cpu(lmm->lmm_stripe_count); i++, lod++)
+                         CDEBUG(level,
+                                "stripe %u idx %u subobj "LPX64"/"LPX64"\n",
+                                i, le32_to_cpu(lod->l_ost_idx),
+                                le64_to_cpu(lod->l_object_gr),
+                                le64_to_cpu(lod->l_object_id));
+        } else {
+                CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
+                       le32_to_cpu(lmm->lmm_stripe_count),
+                       LOV_V1_INSANE_STRIPE_COUNT);
+        }
+}
+
+void lov_dump_lmm(int level, void *lmm)
+{
+        int magic;
+
+        magic = ((struct lov_mds_md_v1 *)(lmm))->lmm_magic;
+        switch (magic) {
+        case LOV_MAGIC_V1:
+                return lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)(lmm));
+        case LOV_MAGIC_JOIN:
+                return lov_dump_lmm_join(level, (struct lov_mds_md_join *)(lmm));
+        case LOV_MAGIC_V3:
+                return lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)(lmm));
+        default:
+                CERROR("Cannot recognize lmm_magic %x", magic);
+        }
+        return;
+}
+
  #define LMM_ASSERT(test)                                                \
  do {                                                                    \
          if (!(test)) lov_dump_lmm(D_ERROR, lmm);                        \
@@ -96,36 +159,57 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
  {
          struct obd_device *obd = class_exp2obd(exp);
          struct lov_obd *lov = &obd->u.lov;
+        struct lov_mds_md_v1 *lmmv1;
+        struct lov_mds_md_v3 *lmmv3;
          struct lov_oinfo *loi;
-        struct lov_mds_md *lmm;
-        int stripe_count = lov->desc.ld_tgt_count;
-        int lmm_size;
+        int stripe_count;
+        struct lov_ost_data_v1 *lmm_objects;
+        int lmm_size, lmm_magic;
          int i;
          ENTRY;
  
          if (lsm) {
-                if (lsm->lsm_magic != LOV_MAGIC) {
-                        CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X\n",
-                               lsm->lsm_magic, LOV_MAGIC);
-                        RETURN(-EINVAL);
-                }
+                lmm_magic = lsm->lsm_magic;
+
                  if (!lmmp) {
-                        stripe_count = lov_get_stripecnt(lov, lsm->lsm_stripe_count);
+                        stripe_count = lov_get_stripecnt(lov,
+                                                         lsm->lsm_stripe_count);
                          lsm->lsm_stripe_count = stripe_count;
                  } else {
                          stripe_count = lsm->lsm_stripe_count;
                  }
+        } else {
+                /* No needs to allocated more than LOV_MAX_STRIPE_COUNT.
+                 * Anyway, this is pretty inaccurate since ld_tgt_count now
+                 * represents max index and we should rely on the actual number
+                 * of OSTs instead */
+                stripe_count = min((__u32)LOV_MAX_STRIPE_COUNT,
+                                   lov->desc.ld_tgt_count);
+
+                if (lmmp && *lmmp)
+                        lmm_magic = le32_to_cpu((*lmmp)->lmm_magic);
+                else
+                        /* lsm == NULL and lmmp == NULL */
+                        lmm_magic = LOV_MAGIC;
+        }
+
+        if ((lmm_magic != LOV_MAGIC_V1) &&
+            (lmm_magic != LOV_MAGIC_V3)) {
+                CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n",
+                       lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3);
+                RETURN(-EINVAL);
          }
  
          /* XXX LOV STACKING call into osc for sizes */
-        lmm_size = lov_mds_md_size(stripe_count);
+        lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
  
          if (!lmmp)
                  RETURN(lmm_size);
  
          if (*lmmp && !lsm) {
                  stripe_count = le32_to_cpu((*lmmp)->lmm_stripe_count);
-                OBD_FREE(*lmmp, lov_mds_md_size(stripe_count));
+                lmm_size = lov_mds_md_size(stripe_count, le32_to_cpu((*lmmp)->lmm_magic));
+                OBD_FREE(*lmmp, lmm_size);
                  *lmmp = NULL;
                  RETURN(0);
          }
@@ -136,27 +220,43 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
                          RETURN(-ENOMEM);
          }
  
-        lmm = *lmmp;
-        lmm->lmm_magic = cpu_to_le32(LOV_MAGIC); /* only write new format */
+        CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n",
+               lmm_magic, lmm_size);
+
+        lmmv1 = *lmmp;
+        lmmv3 = (struct lov_mds_md_v3 *)*lmmp;
+        if (lmm_magic == LOV_MAGIC_V3)
+                lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3);
+        else
+                lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
  
          if (!lsm)
                  RETURN(lmm_size);
  
-        lmm->lmm_object_id = cpu_to_le64(lsm->lsm_object_id);
-        lmm->lmm_object_gr = cpu_to_le64(lsm->lsm_object_gr);
-        lmm->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
-        lmm->lmm_stripe_count = cpu_to_le32(stripe_count);
-        lmm->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+        /* lmmv1 and lmmv3 point to the same struct and have the
+         * same first fields
+         */
+        lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+        lmmv1->lmm_object_id = cpu_to_le64(lsm->lsm_object_id);
+        lmmv1->lmm_object_gr = cpu_to_le64(lsm->lsm_object_gr);
+        lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
+        lmmv1->lmm_stripe_count = cpu_to_le32(stripe_count);
+        if (lsm->lsm_magic == LOV_MAGIC_V3) {
+                strncpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name, LOV_MAXPOOLNAME);
+                lmm_objects = lmmv3->lmm_objects;
+        } else {
+                lmm_objects = lmmv1->lmm_objects;
+        }
  
          for (i = 0; i < stripe_count; i++) {
                  loi = lsm->lsm_oinfo[i];
                  /* XXX LOV STACKING call down to osc_packmd() to do packing */
                  LASSERTF(loi->loi_id, "lmm_oid "LPU64" stripe %u/%u idx %u\n",
-                         lmm->lmm_object_id, i, stripe_count, loi->loi_ost_idx);
-                lmm->lmm_objects[i].l_object_id = cpu_to_le64(loi->loi_id);
-                lmm->lmm_objects[i].l_object_gr = cpu_to_le64(loi->loi_gr);
-                lmm->lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
-                lmm->lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
+                         lmmv1->lmm_object_id, i, stripe_count, loi->loi_ost_idx);
+                lmm_objects[i].l_object_id = cpu_to_le64(loi->loi_id);
+                lmm_objects[i].l_object_gr = cpu_to_le64(loi->loi_gr);
+                lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
+                lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
          }
  
          RETURN(lmm_size);
@@ -186,9 +286,22 @@ static int lov_verify_lmm(void *lmm, int lmm_bytes, int *stripe_count)
          int rc;
  
          if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) {
-                CERROR("bad disk LOV MAGIC: 0x%08X; dumping V1 LMM:\n",
-                       le32_to_cpu(*(__u32 *)lmm));
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                char *buffer;
+                int sz;
+
+                CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n",
+                       le32_to_cpu(*(__u32 *)lmm), lmm_bytes);
+                sz = lmm_bytes * 2 + 1;
+                OBD_ALLOC(buffer, sz);
+                if (buffer != NULL) {
+                        int i;
+
+                        for (i = 0; i < lmm_bytes; i++)
+                                sprintf(buffer+2*i, "%.2X", ((char *)lmm)[i]);
+                        buffer[sz] = '\0';
+                        CERROR("%s\n", buffer);
+                        OBD_FREE(buffer, sz);
+                }
                  return -EINVAL;
          }
          rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm,
@@ -196,7 +309,7 @@ static int lov_verify_lmm(void *lmm, int lmm_bytes, int *stripe_count)
          return rc;
  }
  
-int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, 
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count,
                        int pattern, int magic)
  {
          int i, lsm_size;
@@ -215,6 +328,7 @@ int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count,
          (*lsmp)->lsm_stripe_count = stripe_count;
          (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
          (*lsmp)->lsm_pattern = pattern;
+        (*lsmp)->lsm_pool_name[0] = '\0';
          (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0;
  
          for (i = 0; i < stripe_count; i++)
@@ -226,10 +340,10 @@ int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count,
  void lov_free_memmd(struct lov_stripe_md **lsmp)
  {
          struct lov_stripe_md *lsm = *lsmp;
-        
+
          LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
          lsm_op_find(lsm->lsm_magic)->lsm_free(lsm);
-        
+
          *lsmp = NULL;
  }
  
@@ -237,7 +351,7 @@ void lov_free_memmd(struct lov_stripe_md **lsmp)
  /* Unpack LOV object metadata from disk storage.  It is packed in LE byte
   * order and is opaque to the networking layer.
   */
-int lov_unpackmd(struct obd_export *exp,  struct lov_stripe_md **lsmp, 
+int lov_unpackmd(struct obd_export *exp,  struct lov_stripe_md **lsmp,
                   struct lov_mds_md *lmm, int lmm_bytes)
  {
          struct obd_device *obd = class_exp2obd(exp);
@@ -270,7 +384,7 @@ int lov_unpackmd(struct obd_export *exp,  struct lov_stripe_md **lsmp,
                  RETURN(0);
          }
  
-        lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0, 
+        lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0,
                                     magic);
          if (lsm_size < 0)
                  RETURN(lsm_size);
@@ -301,66 +415,105 @@ int lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
  {
          struct obd_device *obd = class_exp2obd(exp);
          struct lov_obd *lov = &obd->u.lov;
-        struct lov_user_md lum;
+        struct lov_user_md_v3 lumv3;
+        struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+        int lmm_magic;
          int stripe_count;
          int rc;
          ENTRY;
  
-        rc = copy_from_user(&lum, lump, sizeof(lum));
+        rc = copy_from_user(&lumv3, lump, sizeof(struct lov_user_md_v1));
          if (rc)
                  RETURN(-EFAULT);
  
-        if (lum.lmm_magic != LOV_USER_MAGIC) {
-                if (lum.lmm_magic == __swab32(LOV_USER_MAGIC)) {
-                        lustre_swab_lov_user_md(&lum);
-                } else {
-                        CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
-                               " %#08x != %#08x\n",
-                               lum.lmm_magic, LOV_USER_MAGIC);
-                        RETURN(-EINVAL);
-                }
+        lmm_magic = lumv1->lmm_magic;
+
+        if (lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+                lustre_swab_lov_user_md_v1(lumv1);
+                lmm_magic = LOV_USER_MAGIC_V1;
+        } else if (lmm_magic == LOV_USER_MAGIC_V3) {
+                rc = copy_from_user(&lumv3, lump, sizeof(lumv3));
+                if (rc)
+                        RETURN(-EFAULT);
+        } else if (lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+                rc = copy_from_user(&lumv3, lump, sizeof(lumv3));
+                if (rc)
+                        RETURN(-EFAULT);
+                lustre_swab_lov_user_md_v3(&lumv3);
+                lmm_magic = LOV_USER_MAGIC_V3;
+        } else if (lmm_magic != LOV_USER_MAGIC_V1) {
+                CDEBUG(D_IOCTL,
+                       "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+                       lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+                       RETURN(-EINVAL);
          }
  
-        if (lum.lmm_pattern == 0) {
-                lum.lmm_pattern = lov->desc.ld_pattern ?
+        /* in the rest of the tests, as *lumv1 and lumv3 have the same
+         * fields, we use lumv1 to avoid code duplication */
+
+        if (lumv1->lmm_pattern == 0) {
+                lumv1->lmm_pattern = lov->desc.ld_pattern ?
                          lov->desc.ld_pattern : LOV_PATTERN_RAID0;
          }
  
-        if (lum.lmm_pattern != LOV_PATTERN_RAID0) {
+        if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) {
                  CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n",
-                       lum.lmm_pattern);
+                       lumv1->lmm_pattern);
                  RETURN(-EINVAL);
          }
  
          /* 64kB is the largest common page size we see (ia64), and matches the
           * check in lfs */
-        if (lum.lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
+        if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
                  CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n",
-                       lum.lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
-                lum.lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
+                       lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
+                lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
          }
  
-        if ((lum.lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
-            (lum.lmm_stripe_offset != (typeof(lum.lmm_stripe_offset))(-1))) {
+        if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
+            (lumv1->lmm_stripe_offset !=
+             (typeof(lumv1->lmm_stripe_offset))(-1))) {
                  CDEBUG(D_IOCTL, "stripe offset %u > number of OSTs %u\n",
-                       lum.lmm_stripe_offset, lov->desc.ld_tgt_count);
+                       lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count);
                  RETURN(-EINVAL);
          }
-        stripe_count = lov_get_stripecnt(lov, lum.lmm_stripe_count);
  
-        if ((__u64)lum.lmm_stripe_size * stripe_count > ~0U) {
-                CDEBUG(D_IOCTL, "stripe width %ux%u exceeds %u bytes\n",
-                       lum.lmm_stripe_size, (int)lum.lmm_stripe_count, ~0U);
-                RETURN(-EINVAL);
+        stripe_count = lov_get_stripecnt(lov, lumv1->lmm_stripe_count);
+
+        if (lmm_magic == LOV_USER_MAGIC_V3) {
+                struct pool_desc *pool;
+
+                pool = lov_find_pool(lov, lumv3.lmm_pool_name);
+                if (pool == NULL)
+                        RETURN(-EINVAL);
+
+                if (lumv1->lmm_stripe_offset !=
+                    (typeof(lumv1->lmm_stripe_offset))(-1)) {
+                        rc = lov_check_index_in_pool(lumv1->lmm_stripe_offset,
+                                                     pool);
+                        if (rc < 0) {
+                                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+                                RETURN(-EINVAL);
+                        }
+                }
+
+                if (stripe_count > pool_tgt_count(pool))
+                        stripe_count = pool_tgt_count(pool);
+
+                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
          }
  
-        rc = lov_alloc_memmd(lsmp, stripe_count, lum.lmm_pattern, LOV_MAGIC);
+        rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic);
  
          if (rc < 0)
                  RETURN(rc);
  
-        (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lum.lmm_stripe_offset;
-        (*lsmp)->lsm_stripe_size = lum.lmm_stripe_size;
+        (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset;
+        (*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size;
+
+        if (lmm_magic == LOV_USER_MAGIC_V3)
+                strncpy((*lsmp)->lsm_pool_name, lumv3.lmm_pool_name,
+                        LOV_MAXPOOLNAME);
  
          RETURN(0);
  }
@@ -373,20 +526,27 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
          struct obd_export *oexp;
          struct lov_obd *lov = &exp->exp_obd->u.lov;
          obd_id last_id = 0;
+        struct lov_user_ost_data_v1 *lmm_objects;
  
          ENTRY;
+
+        if (lump->lmm_magic == LOV_USER_MAGIC_V3)
+                lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects;
+        else
+                lmm_objects = lump->lmm_objects;
+
          for (i = 0; i < lump->lmm_stripe_count; i++) {
                  __u32 len = sizeof(last_id);
-                oexp = lov->lov_tgts[lump->lmm_objects[i].l_ost_idx]->ltd_exp;
+                oexp = lov->lov_tgts[lmm_objects[i].l_ost_idx]->ltd_exp;
                  rc = obd_get_info(oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID,
-                                  &len, &last_id);
+                                  &len, &last_id, NULL);
                  if (rc)
                          RETURN(rc);
-                if (lump->lmm_objects[i].l_object_id > last_id) {
+                if (lmm_objects[i].l_object_id > last_id) {
                          CERROR("Setting EA for object > than last id on "
                                 "ost idx %d "LPD64" > "LPD64" \n",
-                               lump->lmm_objects[i].l_ost_idx,
-                               lump->lmm_objects[i].l_object_id, last_id);
+                               lmm_objects[i].l_ost_idx,
+                               lmm_objects[i].l_object_id, last_id);
                          RETURN(-EINVAL);
                  }
          }
@@ -397,9 +557,9 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
  
          for (i = 0; i < lump->lmm_stripe_count; i++) {
                  (*lsmp)->lsm_oinfo[i]->loi_ost_idx =
-                        lump->lmm_objects[i].l_ost_idx;
-                (*lsmp)->lsm_oinfo[i]->loi_id = lump->lmm_objects[i].l_object_id;
-                (*lsmp)->lsm_oinfo[i]->loi_gr = lump->lmm_objects[i].l_object_gr;
+                        lmm_objects[i].l_ost_idx;
+                (*lsmp)->lsm_oinfo[i]->loi_id = lmm_objects[i].l_object_id;
+                (*lsmp)->lsm_oinfo[i]->loi_gr = lmm_objects[i].l_object_gr;
          }
          RETURN(0);
  }
@@ -414,19 +574,27 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
  int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
                    struct lov_user_md *lump)
  {
-        struct lov_user_md lum;
+        /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+        struct lov_user_md_v3 lum;
          struct lov_mds_md *lmmk = NULL;
          int rc, lmm_size;
+        int lum_size;
          ENTRY;
  
          if (!lsm)
                  RETURN(-ENODATA);
-
-        rc = copy_from_user(&lum, lump, sizeof(lum));
+        /* we only need the header part from user space to get lmm_magic and
+         * lmm_stripe_count, (the header part is common to v1 and v3) */
+        lum_size = sizeof(struct lov_user_md_v1);
+        rc = copy_from_user(&lum, lump, lum_size);
          if (rc)
                  RETURN(-EFAULT);
+        /* if v3 we just have to update the lum_size */
+        if (lum.lmm_magic == LOV_USER_MAGIC_V3)
+                lum_size = sizeof(struct lov_user_md_v3);
  
-        if (lum.lmm_magic != LOV_USER_MAGIC)
+        if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+            (lum.lmm_magic != LOV_USER_MAGIC_V3))
                  RETURN(-EINVAL);
  
          rc = lov_packmd(exp, &lmmk, lsm);
@@ -436,12 +604,13 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
          rc = 0;
  
          /* FIXME: Bug 1185 - copy fields properly when structs change */
-        LASSERT(sizeof(lum) == sizeof(*lmmk));
+        /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */
+        LASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3));
          LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lmmk->lmm_objects[0]));
  
          /* User wasn't expecting this many OST entries */
          if (lum.lmm_stripe_count == 0) {
-                if (copy_to_user(lump, lmmk, sizeof(lum)))
+                if (copy_to_user(lump, lmmk, lum_size))
                          rc = -EFAULT;
          } else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) {
                  rc = -EOVERFLOW;
diff --git a/lustre/lov/lov_pool.c b/lustre/lov/lov_pool.c

new file mode 100644 (file)

index 0000000..e506d70
--- /dev/null
+++ b/lustre/lov/lov_pool.c
@@ -0,0 +1,660 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pool.c
+ *
+ * OST pool methods
+ *
+ * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#ifdef __KERNEL__
+#include <libcfs/libcfs.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <obd.h>
+#include "lov_internal.h"
+
+static void lov_pool_getref(struct pool_desc *pool) {
+        atomic_inc(&pool->pool_refcount);
+}
+
+static void lov_pool_putref(struct pool_desc *pool) {
+        if (atomic_dec_and_test(&pool->pool_refcount)) {
+                lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
+                lov_ost_pool_free(&(pool->pool_obds));
+                OBD_FREE_PTR(pool);
+        }
+}
+
+/*
+ * hash function using a Rotating Hash algorithm
+ * Knuth, D. The Art of Computer Programming,
+ * Volume 3: Sorting and Searching,
+ * Chapter 6.4.
+ * Addison Wesley, 1973
+ */
+static __u32 pool_hashfn(lustre_hash_t *hash_body, void *key, unsigned mask)
+{
+        int i;
+        __u32 result;
+        char *poolname;
+
+        result = 0;
+        poolname = (char *)key;
+        for (i = 0; i < LOV_MAXPOOLNAME; i++) {
+                if (poolname[i] == '\0')
+                        break;
+                result = (result << 4)^(result >> 28) ^  poolname[i];
+        }
+        return (result % mask);
+}
+
+static void *pool_key(struct hlist_node *hnode)
+{
+        struct pool_desc *pool;
+
+        pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+        return (pool->pool_name);
+}
+
+static int pool_hashkey_compare(void *key, struct hlist_node *compared_hnode)
+{
+        char *pool_name;
+        struct pool_desc *pool;
+        int rc;
+
+        pool_name = (char *)key;
+        pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash);
+        rc = strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME);
+        return (!rc);
+}
+
+static void *pool_hashrefcount_get(struct hlist_node *hnode)
+{
+        struct pool_desc *pool;
+
+        pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+        lov_pool_getref(pool);
+        return (pool);
+}
+
+static void *pool_hashrefcount_put(struct hlist_node *hnode)
+{
+        struct pool_desc *pool;
+
+        pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+        lov_pool_putref(pool);
+        return (pool);
+}
+
+lustre_hash_ops_t pool_hash_operations = {
+        .lh_hash        = pool_hashfn,
+        .lh_key         = pool_key,
+        .lh_compare     = pool_hashkey_compare,
+        .lh_get         = pool_hashrefcount_get,
+        .lh_put         = pool_hashrefcount_put,
+};
+
+#ifdef LPROCFS
+/* ifdef needed for liblustre support */
+/*
+ * pool /proc seq_file methods
+ */
+/*
+ * iterator is used to go through the target pool entries
+ * index is the current entry index in the lp_array[] array
+ * index >= pos returned to the seq_file interface
+ * pos is from 0 to (pool->pool_obds.op_count - 1)
+ */
+#define POOL_IT_MAGIC 0xB001CEA0
+struct pool_iterator {
+        int magic;
+        struct pool_desc *pool;
+        int idx;        /* from 0 to pool_tgt_size - 1 */
+};
+
+static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos)
+{
+        struct pool_iterator *iter = (struct pool_iterator *)s->private;
+        int prev_idx;
+
+        LASSERT(iter->magic == POOL_IT_MAGIC);
+
+        /* test if end of file */
+        if (*pos >= pool_tgt_count(iter->pool))
+                return NULL;
+
+        /* iterate to find a non empty entry */
+        prev_idx = iter->idx;
+        down_read(&pool_tgt_rw_sem(iter->pool));
+        iter->idx++;
+        if (iter->idx == pool_tgt_count(iter->pool)) {
+                iter->idx = prev_idx; /* we stay on the last entry */
+                up_read(&pool_tgt_rw_sem(iter->pool));
+                return NULL;
+        }
+        up_read(&pool_tgt_rw_sem(iter->pool));
+        (*pos)++;
+        /* return != NULL to continue */
+        return iter;
+}
+
+static void *pool_proc_start(struct seq_file *s, loff_t *pos)
+{
+        struct pool_desc *pool = (struct pool_desc *)s->private;
+        struct pool_iterator *iter;
+
+        lov_pool_getref(pool);
+        if ((pool_tgt_count(pool) == 0) ||
+            (*pos >= pool_tgt_count(pool))) {
+                /* iter is not created, so stop() has no way to
+                 * find pool to dec ref */
+                lov_pool_putref(pool);
+                return NULL;
+        }
+
+        OBD_ALLOC_PTR(iter);
+        if (!iter)
+                return ERR_PTR(-ENOMEM);
+        iter->magic = POOL_IT_MAGIC;
+        iter->pool = pool;
+        iter->idx = 0;
+
+        /* we use seq_file private field to memorized iterator so
+         * we can free it at stop() */
+        /* /!\ do not forget to restore it to pool before freeing it */
+        s->private = iter;
+        if (*pos > 0) {
+                loff_t i;
+                void *ptr;
+
+                i = 0;
+                do {
+                     ptr = pool_proc_next(s, &iter, &i);
+                } while ((i < *pos) && (ptr != NULL));
+                return ptr;
+        }
+        return iter;
+}
+
+static void pool_proc_stop(struct seq_file *s, void *v)
+{
+        struct pool_iterator *iter = (struct pool_iterator *)s->private;
+
+        /* in some cases stop() method is called 2 times, without
+         * calling start() method (see seq_read() from fs/seq_file.c)
+         * we have to free only if s->private is an iterator */
+        if ((iter) && (iter->magic == POOL_IT_MAGIC)) {
+                /* we restore s->private so next call to pool_proc_start()
+                 * will work */
+                s->private = iter->pool;
+                lov_pool_putref(iter->pool);
+                OBD_FREE_PTR(iter);
+        }
+        return;
+}
+
+static int pool_proc_show(struct seq_file *s, void *v)
+{
+        struct pool_iterator *iter = (struct pool_iterator *)v;
+        struct lov_tgt_desc *tgt;
+
+        LASSERT(iter->magic == POOL_IT_MAGIC);
+        LASSERT(iter->pool != NULL);
+        LASSERT(iter->idx <= pool_tgt_count(iter->pool));
+
+        down_read(&pool_tgt_rw_sem(iter->pool));
+        tgt = pool_tgt(iter->pool, iter->idx);
+        up_read(&pool_tgt_rw_sem(iter->pool));
+        if (tgt)
+                seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid)));
+
+        return 0;
+}
+
+static struct seq_operations pool_proc_ops = {
+        .start          = pool_proc_start,
+        .next           = pool_proc_next,
+        .stop           = pool_proc_stop,
+        .show           = pool_proc_show,
+};
+
+static int pool_proc_open(struct inode *inode, struct file *file)
+{
+        int rc;
+
+        rc = seq_open(file, &pool_proc_ops);
+        if (!rc) {
+                struct seq_file *s = file->private_data;
+                s->private = PROC_I(inode)->pde->data;
+        }
+        return rc;
+}
+
+static struct file_operations pool_proc_operations = {
+        .open           = pool_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
+#endif /* LPROCFS */
+
+void lov_dump_pool(int level, struct pool_desc *pool)
+{
+        int i;
+
+        lov_pool_getref(pool);
+
+        CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n",
+               pool->pool_name, pool->pool_obds.op_count);
+        down_read(&pool_tgt_rw_sem(pool));
+
+        for (i = 0; i < pool_tgt_count(pool) ; i++) {
+                if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp)
+                        continue;
+                CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n",
+                       pool->pool_name, i,
+                       obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid)));
+        }
+
+        up_read(&pool_tgt_rw_sem(pool));
+        lov_pool_putref(pool);
+}
+
+#define LOV_POOL_INIT_COUNT 2
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
+{
+        if (count == 0)
+                count = LOV_POOL_INIT_COUNT;
+        op->op_array = NULL;
+        op->op_count = 0;
+        init_rwsem(&op->op_rw_sem);
+        op->op_size = count;
+        OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
+        if (op->op_array == NULL) {
+                op->op_size = 0;
+                return -ENOMEM;
+        }
+        return 0;
+}
+
+/* Caller must hold write op_rwlock */
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int max_count)
+{
+        __u32 *new;
+        int new_size;
+
+        LASSERT(max_count != 0);
+
+        if (op->op_count < op->op_size)
+                return 0;
+
+        new_size = min(max_count, 2 * op->op_size);
+        OBD_ALLOC(new, new_size * sizeof(op->op_array[0]));
+        if (new == NULL)
+                return -ENOMEM;
+
+        /* copy old array to new one */
+        memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0]));
+        OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+        op->op_array = new;
+        op->op_size = new_size;
+        return 0;
+}
+
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int max_count)
+{
+        int rc = 0, i;
+        ENTRY;
+
+        down_write(&op->op_rw_sem);
+
+        rc = lov_ost_pool_extend(op, max_count);
+        if (rc)
+                GOTO(out, rc);
+
+        /* search ost in pool array */
+        for (i = 0; i < op->op_count; i++) {
+                if (op->op_array[i] == idx)
+                        GOTO(out, rc = -EEXIST);
+        }
+        /* ost not found we add it */
+        op->op_array[op->op_count] = idx;
+        op->op_count++;
+out:
+        up_write(&op->op_rw_sem);
+        return rc;
+}
+
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
+{
+        int i;
+
+        down_write(&op->op_rw_sem);
+        for (i = 0; i < op->op_count; i++) {
+                if (op->op_array[i] == idx) {
+                        memmove(&op->op_array[i], &op->op_array[i + 1],
+                                (op->op_count - i - 1) * sizeof(op->op_array[0]));
+                        op->op_count--;
+                        up_write(&op->op_rw_sem);
+                        return 0;
+                }
+        }
+        up_write(&op->op_rw_sem);
+        return -EINVAL;
+}
+
+int lov_ost_pool_free(struct ost_pool *op)
+{
+        if (op->op_size == 0)
+                return 0;
+
+        down_write(&op->op_rw_sem);
+        OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+        op->op_array = NULL;
+        op->op_count = 0;
+        op->op_size = 0;
+        up_write(&op->op_rw_sem);
+        return 0;
+}
+
+
+int lov_pool_new(struct obd_device *obd, char *poolname)
+{
+        struct lov_obd *lov;
+        struct pool_desc *new_pool;
+        int rc;
+        ENTRY;
+
+        lov = &(obd->u.lov);
+
+        if (strlen(poolname) > LOV_MAXPOOLNAME)
+                RETURN(-ENAMETOOLONG);
+
+        OBD_ALLOC_PTR(new_pool);
+        if (new_pool == NULL)
+                RETURN(-ENOMEM);
+
+        strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME);
+        new_pool->pool_name[LOV_MAXPOOLNAME] = '\0';
+        new_pool->pool_lov = lov;
+        /* ref count init to 1 because when created a pool is always used
+         * up to deletion
+         */
+        atomic_set(&new_pool->pool_refcount, 1);
+        rc = lov_ost_pool_init(&new_pool->pool_obds, 0);
+        if (rc)
+               GOTO(out_err, rc);
+
+        memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
+        rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+        if (rc) {
+                lov_ost_pool_free(&new_pool->pool_obds);
+                GOTO(out_err, rc);
+        }
+
+        INIT_HLIST_NODE(&new_pool->pool_hash);
+        rc = lustre_hash_add_unique(lov->lov_pools_hash_body, poolname,
+                                    &new_pool->pool_hash);
+        if (rc) {
+                lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+                lov_ost_pool_free(&new_pool->pool_obds);
+                GOTO(out_err, rc = -EEXIST);
+        }
+
+        spin_lock(&obd->obd_dev_lock);
+        list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+        lov->lov_pool_count++;
+
+        spin_unlock(&obd->obd_dev_lock);
+
+        CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
+               poolname, lov->lov_pool_count);
+
+#ifdef LPROCFS
+        /* ifdef needed for liblustre */
+        /* get ref for /proc file */
+        lov_pool_getref(new_pool);
+        new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry,
+                                                       poolname, NULL, NULL,
+                                                       new_pool,
+                                                       &pool_proc_operations);
+#endif
+
+        if (IS_ERR(new_pool->pool_proc_entry)) {
+                CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname);
+                new_pool->pool_proc_entry = NULL;
+                lov_pool_putref(new_pool);
+        }
+
+        RETURN(0);
+
+out_err:
+        OBD_FREE_PTR(new_pool);
+        return rc;
+}
+
+int lov_pool_del(struct obd_device *obd, char *poolname)
+{
+        struct lov_obd *lov;
+        struct pool_desc *pool;
+        ENTRY;
+
+        lov = &(obd->u.lov);
+
+        spin_lock(&obd->obd_dev_lock);
+        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL) {
+                spin_unlock(&obd->obd_dev_lock);
+                RETURN(-ENOENT);
+        }
+
+#ifdef LPROCFS
+        if (pool->pool_proc_entry != NULL) {
+                remove_proc_entry(pool->pool_proc_entry->name,
+                                  pool->pool_proc_entry->parent);
+                lov_pool_putref(pool);
+        }
+#endif
+
+        lustre_hash_del_key(lov->lov_pools_hash_body, poolname);
+        list_del_init(&pool->pool_list);
+
+        lov->lov_pool_count--;
+        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        spin_unlock(&obd->obd_dev_lock);
+
+        /* remove ref got when pool was created in memory
+         * pool will be freed when refount will reach 0
+         */
+        lov_pool_putref(pool);
+
+        RETURN(0);
+}
+
+
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+        struct obd_uuid ost_uuid;
+        struct lov_obd *lov;
+        struct pool_desc *pool;
+        unsigned int i, lov_idx;
+        int rc;
+        ENTRY;
+
+        lov = &(obd->u.lov);
+
+        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL)
+                RETURN(-ENOENT);
+
+        obd_str2uuid(&ost_uuid, ostname);
+
+        /* search ost in lov array */
+        mutex_down(&lov->lov_lock);
+        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                if (!lov->lov_tgts[i])
+                        continue;
+                if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid)))
+                        break;
+        }
+
+        /* test if ost found in lov */
+        if (i == lov->desc.ld_tgt_count) {
+                mutex_up(&lov->lov_lock);
+                GOTO(out, rc = -EINVAL);
+        }
+        mutex_up(&lov->lov_lock);
+
+        lov_idx = i;
+
+        rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
+        if (rc)
+                GOTO(out, rc);
+
+        pool->pool_rr.lqr_dirty = 1;
+
+        CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n",
+               ostname, poolname,  pool_tgt_count(pool));
+        rc = 0;
+        EXIT;
+out:
+        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        return rc;
+}
+
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
+{
+        struct obd_uuid ost_uuid;
+        struct lov_obd *lov;
+        struct pool_desc *pool;
+        unsigned int i, lov_idx;
+        int rc;
+        ENTRY;
+
+        lov = &(obd->u.lov);
+
+        spin_lock(&obd->obd_dev_lock);
+        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL) {
+                spin_unlock(&obd->obd_dev_lock);
+                RETURN(-ENOENT);
+        }
+
+        obd_str2uuid(&ost_uuid, ostname);
+
+        /* search ost in lov array, to get index */
+        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                if (!lov->lov_tgts[i])
+                        continue;
+
+                if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid)))
+                        break;
+        }
+
+        /* test if ost found in lov */
+        if (i == lov->desc.ld_tgt_count) {
+                spin_unlock(&obd->obd_dev_lock);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        spin_unlock(&obd->obd_dev_lock);
+
+        lov_idx = i;
+
+        lov_ost_pool_remove(&pool->pool_obds, lov_idx);
+
+        pool->pool_rr.lqr_dirty = 1;
+
+        CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
+               poolname);
+        rc = 0;
+        EXIT;
+out:
+        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        return rc;
+}
+
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool)
+{
+        int i, rc;
+        ENTRY;
+
+        /* caller may no have a ref on pool if it got the pool
+         * without calling lov_find_pool() (e.g. go through the lov pool
+         * list)
+         */
+        lov_pool_getref(pool);
+
+        down_read(&pool_tgt_rw_sem(pool));
+        for (i = 0; i < pool_tgt_count(pool); i++) {
+                if (pool_tgt_array(pool)[i] == idx)
+                        GOTO(out, rc = 0);
+        }
+        rc = -ENOENT;
+        EXIT;
+out:
+        up_read(&pool_tgt_rw_sem(pool));
+
+        lov_pool_putref(pool);
+        return rc;
+}
+
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
+{
+        struct pool_desc *pool;
+
+        pool = NULL;
+        if (poolname[0] != '\0') {
+                pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+                if (pool == NULL)
+                        CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n",
+                              poolname);
+                if ((pool != NULL) && (pool_tgt_count(pool) == 0)) {
+                        CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n",
+                               poolname);
+                        /* pool is ignored, so we remove ref on it */
+                        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+                        pool = NULL;
+                }
+        }
+        return pool;
+}
+
diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c

index 695704d..d25871d 100644 (file)
--- a/lustre/lov/lov_qos.c
+++ b/lustre/lov/lov_qos.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -96,7 +108,7 @@ int qos_add_tgt(struct obd_device *obd, __u32 index)
          list_add_tail(&oss->lqo_oss_list, &temposs->lqo_oss_list);
  
          lov->lov_qos.lq_dirty = 1;
-        lov->lov_qos.lq_dirty_rr = 1;
+        lov->lov_qos.lq_rr.lqr_dirty = 1;
  
          CDEBUG(D_QOS, "add tgt %s to OSS %s (%d OSTs)\n",
                 obd_uuid2str(&lov->lov_tgts[index]->ltd_uuid),
@@ -134,7 +146,7 @@ int qos_del_tgt(struct obd_device *obd, __u32 index)
          }
  
          lov->lov_qos.lq_dirty = 1;
-        lov->lov_qos.lq_dirty_rr = 1;
+        lov->lov_qos.lq_rr.lqr_dirty = 1;
  out:
          up_write(&lov->lov_qos.lq_rw_sem);
          RETURN(rc);
@@ -149,6 +161,7 @@ static int qos_calc_ppo(struct obd_device *obd)
          __u64 ba_max, ba_min, temp;
          __u32 num_active;
          int rc, i, prio_wide;
+        time_t now, age;
          ENTRY;
  
          if (!lov->lov_qos.lq_dirty)
@@ -171,6 +184,7 @@ static int qos_calc_ppo(struct obd_device *obd)
  
          ba_min = (__u64)(-1);
          ba_max = 0;
+        now = cfs_time_current_sec();
          /* Calculate OST penalty per object */
          /* (lov ref taken in alloc_qos) */
          for (i = 0; i < lov->desc.ld_tgt_count; i++) {
@@ -193,8 +207,17 @@ static int qos_calc_ppo(struct obd_device *obd)
                  lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj =
                          (temp * prio_wide) >> 8;
  
-                if (lov->lov_qos.lq_reset == 0)
+                age = (now - lov->lov_tgts[i]->ltd_qos.ltq_used) >> 3;
+                if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
                          lov->lov_tgts[i]->ltd_qos.ltq_penalty = 0;
+                else if (age > lov->desc.ld_qos_maxage)
+                        /* Decay the penalty by half for every 8x the update
+                         * interval that the device has been idle.  That gives
+                         * lots of time for the statfs information to be
+                         * updated (which the penalty is only a proxy for),
+                         * and avoids penalizing OSS/OSTs under light load. */
+                        lov->lov_tgts[i]->ltd_qos.ltq_penalty >>=
+                                (age / lov->desc.ld_qos_maxage);
          }
  
          num_active = lov->lov_qos.lq_active_oss_count - 1;
@@ -203,7 +226,7 @@ static int qos_calc_ppo(struct obd_device *obd)
                     we have to double the OST penalty */
                  num_active = 1;
                  for (i = 0; i < lov->desc.ld_tgt_count; i++)
-                        if (lov->lov_tgts[i]) 
+                        if (lov->lov_tgts[i])
                              lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj <<= 1;
          }
  
@@ -212,8 +235,17 @@ static int qos_calc_ppo(struct obd_device *obd)
                  temp = oss->lqo_bavail >> 1;
                  do_div(temp, oss->lqo_ost_count * num_active);
                  oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8;
-                if (lov->lov_qos.lq_reset == 0)
+
+                age = (now - oss->lqo_used) >> 3;
+                if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
                          oss->lqo_penalty = 0;
+                else if (age > lov->desc.ld_qos_maxage)
+                        /* Decay the penalty by half for every 8x the update
+                         * interval that the device has been idle.  That gives
+                         * lots of time for the statfs information to be
+                         * updated (which the penalty is only a proxy for),
+                         * and avoids penalizing OSS/OSTs under light load. */
+                        oss->lqo_penalty >>= (age / lov->desc.ld_qos_maxage);
          }
  
          lov->lov_qos.lq_dirty = 0;
@@ -228,7 +260,7 @@ static int qos_calc_ppo(struct obd_device *obd)
                  /* Difference is less than 20% */
                  lov->lov_qos.lq_same_space = 1;
                  /* Reset weights for the next time we enter qos mode */
-                lov->lov_qos.lq_reset = 0;
+                lov->lov_qos.lq_reset = 1;
          }
          rc = 0;
  
@@ -254,10 +286,11 @@ static int qos_calc_weight(struct lov_obd *lov, int i)
  }
  
  /* We just used this index for a stripe; adjust everyone's weights */
-static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
+static int qos_used(struct lov_obd *lov, struct ost_pool *osts,
+                    __u32 index, __u64 *total_wt)
  {
          struct lov_qos_oss *oss;
-        int i;
+        int j;
          ENTRY;
  
          /* Don't allocate from this stripe anymore, until the next alloc_qos */
@@ -270,6 +303,10 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
          lov->lov_tgts[index]->ltd_qos.ltq_penalty >>= 1;
          oss->lqo_penalty >>= 1;
  
+        /* mark the OSS and OST as recently used */
+        lov->lov_tgts[index]->ltd_qos.ltq_used =
+                oss->lqo_used = cfs_time_current_sec();
+
          /* Set max penalties for this OST and OSS */
          lov->lov_tgts[index]->ltd_qos.ltq_penalty +=
                  lov->lov_tgts[index]->ltd_qos.ltq_penalty_per_obj *
@@ -287,7 +324,10 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
  
          *total_wt = 0;
          /* Decrease all OST penalties */
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+        for (j = 0; j < osts->op_count; j++) {
+                int i;
+
+                i = osts->op_array[j];
                  if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
                          continue;
                  if (lov->lov_tgts[i]->ltd_qos.ltq_penalty <
@@ -304,10 +344,11 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
                          *total_wt += lov->lov_tgts[i]->ltd_qos.ltq_weight;
  
  #ifdef QOS_DEBUG
-                CDEBUG(D_QOS, "recalc tgt %d avail="LPU64
+                CDEBUG(D_QOS, "recalc tgt %d usable=%d avail="LPU64
                         " ostppo="LPU64" ostp="LPU64" ossppo="LPU64
                         " ossp="LPU64" wt="LPU64"\n",
-                       i, TGT_BAVAIL(i) >> 10,
+                       i, lov->lov_tgts[i]->ltd_qos.ltq_usable,
+                       TGT_BAVAIL(i) >> 10,
                         lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj >> 10,
                         lov->lov_tgts[i]->ltd_qos.ltq_penalty >> 10,
                         lov->lov_tgts[i]->ltd_qos.ltq_oss->lqo_penalty_per_obj>>10,
@@ -320,62 +361,59 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
  }
  
  #define LOV_QOS_EMPTY ((__u32)-1)
-/* compute optimal round-robin order, based on OSTs per OSS */
-static int qos_calc_rr(struct lov_obd *lov)
+/* compute optimal round-robin order, based on OSTs per OSS
+ */
+static int qos_calc_rr(struct lov_obd *lov, struct ost_pool *src_pool,
+                       struct lov_qos_rr *lqr)
  {
          struct lov_qos_oss *oss;
-        unsigned ost_count, placed, real_count;
-        int i;
+        unsigned placed, real_count;
+        int i, rc;
          ENTRY;
  
-        if (!lov->lov_qos.lq_dirty_rr) {
-                LASSERT(lov->lov_qos.lq_rr_size);
+        if (!lqr->lqr_dirty) {
+                LASSERT(lqr->lqr_pool.op_size);
                  RETURN(0);
          }
  
          /* Do actual allocation. */
          down_write(&lov->lov_qos.lq_rw_sem);
-        ost_count = lov->desc.ld_tgt_count;
-
-        if (lov->lov_qos.lq_rr_size)
-                OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size);
-        lov->lov_qos.lq_rr_size = ost_count *
-                sizeof(lov->lov_qos.lq_rr_array[0]);
-        OBD_ALLOC(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size);
-        if (!lov->lov_qos.lq_rr_array) {
-                lov->lov_qos.lq_rr_size = 0;
-                up_write(&lov->lov_qos.lq_rw_sem);
-                RETURN(-ENOMEM);
-        }
  
-        real_count = 0;
-        for (i = 0; i < ost_count; i++) {
-                lov->lov_qos.lq_rr_array[i] = LOV_QOS_EMPTY;
-                if (lov->lov_tgts[i])
-                        real_count++;
+        real_count = src_pool->op_count;
+
+        /* Zero the pool array */
+        /* alloc_rr is holding a read lock on the pool, so nobody is adding/
+           deleting from the pool. The lq_rw_sem insures that nobody else
+           is reading. */
+        lqr->lqr_pool.op_count = real_count;
+        rc = lov_ost_pool_extend(&lqr->lqr_pool, real_count);
+        if (rc) {
+                up_write(&lov->lov_qos.lq_rw_sem);
+                RETURN(rc);
          }
+        for (i = 0; i < lqr->lqr_pool.op_count; i++)
+                lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY;
  
          /* Place all the OSTs from 1 OSS at the same time. */
          placed = 0;
          list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) {
                  int j = 0;
-                for (i = 0; i < ost_count; i++) {
-                        if (lov->lov_tgts[i] &&
-                            lov->lov_tgts[i]->ltd_qos.ltq_oss == oss) {
+                for (i = 0; i < lqr->lqr_pool.op_count; i++) {
+                        if (lov->lov_tgts[src_pool->op_array[i]] &&
+                            (lov->lov_tgts[src_pool->op_array[i]]->ltd_qos.ltq_oss == oss)) {
                                /* Evenly space these OSTs across arrayspace */
-                              int next = j * ost_count / oss->lqo_ost_count;
-                              while (lov->lov_qos.lq_rr_array[next] !=
+                              int next = j * lqr->lqr_pool.op_count / oss->lqo_ost_count;
+                              while (lqr->lqr_pool.op_array[next] !=
                                       LOV_QOS_EMPTY)
-                                      next = (next + 1) % ost_count;
-                              lov->lov_qos.lq_rr_array[next] = i;
+                                      next = (next + 1) % lqr->lqr_pool.op_count;
+                              lqr->lqr_pool.op_array[next] = src_pool->op_array[i];
                                j++;
                                placed++;
                          }
                  }
-                LASSERT(j == oss->lqo_ost_count);
          }
  
-        lov->lov_qos.lq_dirty_rr = 0;
+        lqr->lqr_dirty = 0;
          up_write(&lov->lov_qos.lq_rw_sem);
  
          if (placed != real_count) {
@@ -383,18 +421,18 @@ static int qos_calc_rr(struct lov_obd *lov)
                  LCONSOLE_ERROR_MSG(0x14e, "Failed to place all OSTs in the "
                                     "round-robin list (%d of %d).\n",
                                     placed, real_count);
-                for (i = 0; i < ost_count; i++) {
+                for (i = 0; i < lqr->lqr_pool.op_count; i++) {
                          LCONSOLE(D_WARNING, "rr #%d ost idx=%d\n", i,
-                                 lov->lov_qos.lq_rr_array[i]);
+                                 lqr->lqr_pool.op_array[i]);
                  }
-                lov->lov_qos.lq_dirty_rr = 1;
+                lqr->lqr_dirty = 1;
                  RETURN(-EAGAIN);
          }
  
  #ifdef QOS_DEBUG
-        for (i = 0; i < ost_count; i++) {
+        for (i = 0; i < lqr->lqr_pool.op_count; i++) {
                  LCONSOLE(D_QOS, "rr #%d ost idx=%d\n", i,
-                         lov->lov_qos.lq_rr_array[i]);
+                         lqr->lqr_pool.op_array[i]);
          }
  #endif
  
@@ -493,54 +531,67 @@ static int min_stripe_count(int stripe_cnt, int flags)
  #define LOV_CREATE_RESEED_MIN  1000
  /* Allocate objects on osts with round-robin algorithm */
  static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt,
-                    int flags)
+                    char *poolname, int flags)
  {
-        unsigned array_idx, ost_count = lov->desc.ld_tgt_count;
-        unsigned ost_active_count = lov->desc.ld_active_tgt_count;
-        int i, *idx_pos;
+        unsigned array_idx;
+        int i, rc, *idx_pos;
          __u32 ost_idx;
          int ost_start_idx_temp;
          int speed = 0;
          int stripe_cnt_min = min_stripe_count(*stripe_cnt, flags);
+        struct pool_desc *pool;
+        struct ost_pool *osts;
+        struct lov_qos_rr *lqr;
          ENTRY;
  
-        i = qos_calc_rr(lov);
-        if (i)
-                RETURN(i);
-
-        if (--lov->lov_start_count <= 0) {
-                lov->lov_start_idx = ll_rand() % ost_count;
-                lov->lov_start_count =
-                        (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
-                         LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
-        } else if (stripe_cnt_min >= ost_active_count ||
-                   lov->lov_start_idx > ost_count) {
+        pool = lov_find_pool(lov, poolname);
+        if (pool == NULL) {
+                osts = &(lov->lov_packed);
+                lqr = &(lov->lov_qos.lq_rr);
+        } else {
+                down_read(&pool_tgt_rw_sem(pool));
+                osts = &(pool->pool_obds);
+                lqr = &(pool->pool_rr);
+        }
+
+        rc = qos_calc_rr(lov, osts, lqr);
+        if (rc)
+                GOTO(out, rc);
+
+        if (--lqr->lqr_start_count <= 0) {
+                lqr->lqr_start_idx = ll_rand() % osts->op_count;
+                lqr->lqr_start_count =
+                        (LOV_CREATE_RESEED_MIN / max(osts->op_count, 1U) +
+                         LOV_CREATE_RESEED_MULT) * max(osts->op_count, 1U);
+        } else if (stripe_cnt_min >= osts->op_count ||
+                   lqr->lqr_start_idx > osts->op_count) {
                  /* If we have allocated from all of the OSTs, slowly
                   * precess the next start if the OST/stripe count isn't
                   * already doing this for us. */
-                lov->lov_start_idx %= ost_count;
-                if (*stripe_cnt > 1 && (ost_active_count % (*stripe_cnt)) != 1)
-                        ++lov->lov_offset_idx;
+                lqr->lqr_start_idx %= osts->op_count;
+                if (*stripe_cnt > 1 && (osts->op_count % (*stripe_cnt)) != 1)
+                        ++lqr->lqr_offset_idx;
          }
          down_read(&lov->lov_qos.lq_rw_sem);
-        ost_start_idx_temp = lov->lov_start_idx;
+        ost_start_idx_temp = lqr->lqr_start_idx;
  
  repeat_find:
-        array_idx = (lov->lov_start_idx + lov->lov_offset_idx) % ost_count;
+        array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % osts->op_count;
          idx_pos = idx_arr;
  #ifdef QOS_DEBUG
-        CDEBUG(D_QOS, "want %d startidx %d startcnt %d offset %d active %d "
-               "count %d arrayidx %d\n",
-               stripe_cnt, lov->lov_start_idx, lov->lov_start_count,
-               lov->lov_offset_idx, ost_active_count, ost_count, array_idx);
+        CDEBUG(D_QOS, "pool '%s' want %d startidx %d startcnt %d offset %d "
+               "active %d count %d arrayidx %d\n", poolname,
+               *stripe_cnt, lqr->lqr_start_idx, lqr->lqr_start_count,
+               lqr->lqr_offset_idx, osts->op_count, osts->op_count, array_idx);
  #endif
  
-        for (i = 0; i < ost_count; i++, array_idx=(array_idx + 1) % ost_count) {
-                ++lov->lov_start_idx;
-                ost_idx = lov->lov_qos.lq_rr_array[array_idx];
+        for (i = 0; i < osts->op_count;
+                    i++, array_idx=(array_idx + 1) % osts->op_count) {
+                ++lqr->lqr_start_idx;
+                ost_idx = lqr->lqr_pool.op_array[array_idx];
  #ifdef QOS_DEBUG
                  CDEBUG(D_QOS, "#%d strt %d act %d strp %d ary %d idx %d\n",
-                       i, lov->lov_start_idx,
+                       i, lqr->lqr_start_idx,
                         ((ost_idx != LOV_QOS_EMPTY) && lov->lov_tgts[ost_idx]) ?
                         lov->lov_tgts[ost_idx]->ltd_active : 0,
                         idx_pos - idx_arr, array_idx, ost_idx);
@@ -567,29 +618,64 @@ repeat_find:
          if ((speed < 2) && (idx_pos - idx_arr < stripe_cnt_min)) {
                  /* Try again, allowing slower OSCs */
                  speed++;
-                lov->lov_start_idx = ost_start_idx_temp;
+                lqr->lqr_start_idx = ost_start_idx_temp;
                  goto repeat_find;
          }
  
          up_read(&lov->lov_qos.lq_rw_sem);
  
          *stripe_cnt = idx_pos - idx_arr;
-        RETURN(0);
+out:
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        }
+
+        RETURN(rc);
  }
  
  /* alloc objects on osts with specific stripe offset */
  static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm,
                            int *idx_arr)
  {
-        unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
-        int i, *idx_pos;
+        unsigned ost_idx, array_idx, ost_count;
+        int i, rc, *idx_pos;
          int speed = 0;
+        struct pool_desc *pool;
+        struct ost_pool *osts;
          ENTRY;
  
+        pool = lov_find_pool(lov, lsm->lsm_pool_name);
+        if (pool == NULL) {
+                osts = &(lov->lov_packed);
+        } else {
+                down_read(&pool_tgt_rw_sem(pool));
+                osts = &(pool->pool_obds);
+        }
+
+        ost_count = osts->op_count;
+
  repeat_find:
-        ost_idx = lsm->lsm_oinfo[0]->loi_ost_idx;
+        /* search loi_ost_idx in ost array */
+        array_idx = 0;
+        for (i = 0; i < ost_count; i++) {
+                if (osts->op_array[i] == lsm->lsm_oinfo[0]->loi_ost_idx) {
+                        array_idx = i;
+                        break;
+                }
+        }
+        if (i == ost_count) {
+                CERROR("Start index %d not found in pool '%s'\n",
+                       lsm->lsm_oinfo[0]->loi_ost_idx, lsm->lsm_pool_name);
+                GOTO(out, rc = -EINVAL);
+        }
+
          idx_pos = idx_arr;
-        for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
+        for (i = 0; i < ost_count;
+             i++, array_idx = (array_idx + 1) % ost_count) {
+                ost_idx = osts->op_array[array_idx];
+
                  if (!lov->lov_tgts[ost_idx] ||
                      !lov->lov_tgts[ost_idx]->ltd_active) {
                          continue;
@@ -600,16 +686,20 @@ repeat_find:
                  if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
                          continue;
  
-                /* Drop slow OSCs if we can, but not for requested start idx */
+                /* Drop slow OSCs if we can, but not for requested start idx.
+                 *
+                 * This means "if OSC is slow and it is not the requested
+                 * start OST, then it can be skipped, otherwise skip it only
+                 * if it is inactive/recovering/out-of-space." */
                  if ((obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp) > speed) &&
-                    (i != 0 || speed < 2))
+                    (i != 0 || speed >= 2))
                          continue;
  
                  *idx_pos = ost_idx;
                  idx_pos++;
                  /* We have enough stripes */
                  if (idx_pos - idx_arr == lsm->lsm_stripe_count)
-                        RETURN(0);
+                        GOTO(out, rc = 0);
          }
          if (speed < 2) {
                  /* Try again, allowing slower OSCs */
@@ -626,7 +716,14 @@ repeat_find:
          CERROR("can't lstripe objid "LPX64": have %d want %u\n",
                 lsm->lsm_object_id, (int)(idx_pos - idx_arr),
                 lsm->lsm_stripe_count);
-        RETURN(-EFBIG);
+        rc = -EFBIG;
+out:
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        }
+        RETURN(rc);
  }
  
  /* Alloc objects on osts with optimization based on:
@@ -634,25 +731,35 @@ repeat_find:
     - network resources (shared OSS's)
  */
  static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
-                     int flags)
+                     char *poolname, int flags)
  {
          struct lov_obd *lov = &exp->exp_obd->u.lov;
          static time_t last_warn = 0;
          time_t now = cfs_time_current_sec();
          __u64 total_bavail, total_weight = 0;
-        __u32 ost_count;
          int nfound, good_osts, i, warn = 0, rc = 0;
          int stripe_cnt_min = min_stripe_count(*stripe_cnt, flags);
+        struct pool_desc *pool;
+        struct ost_pool *osts;
+        struct lov_qos_rr *lqr;
          ENTRY;
  
          if (stripe_cnt_min < 1)
-                GOTO(out, rc = -EINVAL);
+                RETURN(-EINVAL);
+
+        pool = lov_find_pool(lov, poolname);
+        if (pool == NULL) {
+                osts = &(lov->lov_packed);
+                lqr = &(lov->lov_qos.lq_rr);
+        } else {
+                down_read(&pool_tgt_rw_sem(pool));
+                osts = &(pool->pool_obds);
+                lqr = &(pool->pool_rr);
+        }
  
          lov_getref(exp->exp_obd);
          down_write(&lov->lov_qos.lq_rw_sem);
  
-        ost_count = lov->desc.ld_tgt_count;
-
          if (lov->desc.ld_active_tgt_count < 2)
                  GOTO(out, rc = -EAGAIN);
  
@@ -666,24 +773,25 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
          if (cfs_time_sub(now, last_warn) > 60 * 30)
                  warn = 1;
          /* Find all the OSTs that are valid stripe candidates */
-        for (i = 0; i < ost_count; i++) {
+        for (i = 0; i < osts->op_count; i++) {
                  __u64 bavail;
  
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
+                if (!lov->lov_tgts[osts->op_array[i]] ||
+                    !lov->lov_tgts[osts->op_array[i]]->ltd_active)
                          continue;
-                bavail = TGT_BAVAIL(i);
+                bavail = TGT_BAVAIL(osts->op_array[i]);
                  if (!bavail) {
                          if (warn) {
                                  CDEBUG(D_QOS, "no free space on %s\n",
-                                     obd_uuid2str(&lov->lov_tgts[i]->ltd_uuid));
+                                     obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid));
                                  last_warn = now;
                          }
                          continue;
                  }
-                if (!TGT_FFREE(i)) {
+                if (!TGT_FFREE(osts->op_array[i])) {
                          if (warn) {
                                  CDEBUG(D_QOS, "no free inodes on %s\n",
-                                     obd_uuid2str(&lov->lov_tgts[i]->ltd_uuid));
+                                     obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid));
                                  last_warn = now;
                          }
                          continue;
@@ -691,20 +799,24 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
  
                  /* Fail Check before osc_precreate() is called
                     so we can only 'fail' single OSC. */
-                if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && i == 0)
+                if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && osts->op_array[i] == 0)
                          continue;
  
-                if (obd_precreate(lov->lov_tgts[i]->ltd_exp) > 2)
+                if (obd_precreate(lov->lov_tgts[osts->op_array[i]]->ltd_exp) > 2)
                          continue;
  
-                lov->lov_tgts[i]->ltd_qos.ltq_usable = 1;
-                qos_calc_weight(lov, i);
+                lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_usable = 1;
+                qos_calc_weight(lov, osts->op_array[i]);
                  total_bavail += bavail;
-                total_weight += lov->lov_tgts[i]->ltd_qos.ltq_weight;
+                total_weight += lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_weight;
  
                  good_osts++;
          }
  
+#ifdef QOS_DEBUG
+        CDEBUG(D_QOS, "found %d good osts\n", good_osts);
+#endif
+
          if (good_osts < stripe_cnt_min)
                  GOTO(out, rc = -EAGAIN);
  
@@ -750,36 +862,50 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
  
                  /* On average, this will hit larger-weighted osts more often.
                     0-weight osts will always get used last (only when rand=0).*/
-                for (i = 0; i < ost_count; i++) {
-                        if (!lov->lov_tgts[i] ||
-                            !lov->lov_tgts[i]->ltd_qos.ltq_usable)
+                for (i = 0; i < osts->op_count; i++) {
+                        if (!lov->lov_tgts[osts->op_array[i]] ||
+                            !lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_usable)
                                  continue;
  
-                        cur_weight += lov->lov_tgts[i]->ltd_qos.ltq_weight;
+                        cur_weight += lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_weight;
+#ifdef QOS_DEBUG
+                        CDEBUG(D_QOS, "stripe_cnt=%d nfound=%d cur_weight="LPU64
+                                      " rand="LPU64" total_weight="LPU64"\n",
+                               *stripe_cnt, nfound, cur_weight, rand, total_weight);
+#endif
                          if (cur_weight >= rand) {
  #ifdef QOS_DEBUG
                                  CDEBUG(D_QOS, "assigned stripe=%d to idx=%d\n",
-                                       nfound, i);
+                                       nfound, osts->op_array[i]);
  #endif
-                                idx_arr[nfound++] = i;
-                                qos_used(lov, i, &total_weight);
+                                idx_arr[nfound++] = osts->op_array[i];
+                                qos_used(lov, osts, osts->op_array[i], &total_weight);
                                  rc = 0;
                                  break;
                          }
                  }
-                /* should never satisfy below condition */
                  if (rc) {
-                        CERROR("Didn't find any OSTs?\n");
-                        break;
+                        CDEBUG(D_QOS, "Didn't find any OSTs? Reduce total weight\n");
+                        if (total_weight == 0)
+                                break;
+                        else
+                                total_weight = 0;
                  }
          }
+
          LASSERT(nfound == *stripe_cnt);
  
  out:
          up_write(&lov->lov_qos.lq_rw_sem);
  
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        }
+
          if (rc == -EAGAIN)
-                rc = alloc_rr(lov, idx_arr, stripe_cnt, flags);
+                rc = alloc_rr(lov, idx_arr, stripe_cnt, poolname, flags);
  
          lov_putref(exp->exp_obd);
          RETURN(rc);
@@ -804,7 +930,8 @@ static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm,
  
          if (newea ||
              lsm->lsm_oinfo[0]->loi_ost_idx >= lov->desc.ld_tgt_count)
-                rc = alloc_qos(exp, tmp_arr, &stripe_cnt, flags);
+                rc = alloc_qos(exp, tmp_arr, &stripe_cnt,
+                               lsm->lsm_pool_name, flags);
          else
                  rc = alloc_specific(lov, lsm, tmp_arr);
  
@@ -850,7 +977,6 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
                          /* Find a small number of stripes we can use
                             (up to # of active osts). */
                          stripes = 1;
-                        lov_getref(exp->exp_obd);
                          for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                                  if (!lov->lov_tgts[i] ||
                                      !lov->lov_tgts[i]->ltd_active)
@@ -860,7 +986,6 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
                                          break;
                                  stripes++;
                          }
-                        lov_putref(exp->exp_obd);
  
                          if (stripes < stripes_def)
                                  stripes = stripes_def;
@@ -955,4 +1080,3 @@ void qos_update(struct lov_obd *lov)
          ENTRY;
          lov->lov_qos.lq_dirty = 1;
  }
-
diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c

index 82153b7..7e2eda5 100644 (file)
--- a/lustre/lov/lov_request.c
+++ b/lustre/lov/lov_request.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -337,6 +349,7 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
  
                  /* XXX LOV STACKING: submd should be from the subobj */
                  req->rq_oi.oi_md->lsm_object_id = loi->loi_id;
+                req->rq_oi.oi_md->lsm_object_gr = loi->loi_gr;
                  req->rq_oi.oi_md->lsm_stripe_count = 0;
                  req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms_valid =
                          loi->loi_kms_valid;
@@ -447,6 +460,7 @@ int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
  
                  /* XXX LOV STACKING: submd should be from the subobj */
                  req->rq_oi.oi_md->lsm_object_id = loi->loi_id;
+                req->rq_oi.oi_md->lsm_object_gr = loi->loi_gr;
                  req->rq_oi.oi_md->lsm_stripe_count = 0;
  
                  lov_set_add_req(req, set);
@@ -1003,6 +1017,7 @@ int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
                         sizeof(*req->rq_oi.oi_oa));
                  req->rq_oi.oi_oa->o_id = loi->loi_id;
                  req->rq_oi.oi_cb_up = cb_getattr_update;
+                req->rq_rqset = set;
  
                  lov_set_add_req(req, set);
          }
@@ -1365,8 +1380,16 @@ int lov_fini_sync_set(struct lov_request_set *set)
          RETURN(rc);
  }
  
+/* The callback for osc_sync that finilizes a request info when a
+ * response is recieved. */
+static int cb_sync_update(struct obd_info *oinfo, int rc)
+{
+        struct lov_request *lovreq;
+        lovreq = container_of(oinfo, struct lov_request, rq_oi);
+        return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
  int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
-                      struct obdo *src_oa, struct lov_stripe_md *lsm,
                        obd_off start, obd_off end,
                        struct lov_request_set **reqset)
  {
@@ -1383,21 +1406,22 @@ int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
  
          set->set_exp = exp;
          set->set_oi = oinfo;
-        set->set_oi->oi_md = lsm;
-        set->set_oi->oi_oa = src_oa;
+        set->set_oi->oi_md = oinfo->oi_md;
+        set->set_oi->oi_oa = oinfo->oi_oa;
  
-        for (i = 0; i < lsm->lsm_stripe_count; i++) {
+        for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
                  struct lov_request *req;
                  obd_off rs, re;
  
-                loi = lsm->lsm_oinfo[i];
+                loi = oinfo->oi_md->lsm_oinfo[i];
                  if (!lov->lov_tgts[loi->loi_ost_idx] ||
                      !lov->lov_tgts[loi->loi_ost_idx]->ltd_active) {
                          CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                          continue;
                  }
  
-                if (!lov_stripe_intersects(lsm, i, start, end, &rs, &re))
+                if (!lov_stripe_intersects(oinfo->oi_md, i, start,
+                                           end, &rs, &re))
                          continue;
  
                  OBD_ALLOC(req, sizeof(*req));
@@ -1411,13 +1435,16 @@ int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
                          OBD_FREE(req, sizeof(*req));
                          GOTO(out_set, rc = -ENOMEM);
                  }
-                memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+                memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                       sizeof(*req->rq_oi.oi_oa));
                  req->rq_oi.oi_oa->o_id = loi->loi_id;
                  req->rq_oi.oi_oa->o_stripe_idx = i;
  
                  req->rq_oi.oi_policy.l_extent.start = rs;
                  req->rq_oi.oi_policy.l_extent.end = re;
                  req->rq_oi.oi_policy.l_extent.gid = -1;
+                req->rq_oi.oi_cb_up = cb_sync_update;
+                req->rq_rqset = set;
  
                  lov_set_add_req(req, set);
          }
@@ -1453,7 +1480,7 @@ int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,int success)
  
                  spin_lock(&obd->obd_osfs_lock);
                  memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
-                obd->obd_osfs_age = get_jiffies_64();
+                obd->obd_osfs_age = cfs_time_current_64();
                  spin_unlock(&obd->obd_osfs_lock);
                  RETURN(0);
          }
@@ -1480,15 +1507,11 @@ int lov_fini_statfs_set(struct lov_request_set *set)
          RETURN(rc);
  }
  
-void lov_update_statfs(struct obd_device *obd, struct obd_statfs *osfs,
-                       struct obd_statfs *lov_sfs, int success)
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+                       int success)
  {
          int shift = 0, quit = 0;
          __u64 tmp;
-        spin_lock(&obd->obd_osfs_lock);
-        memcpy(&obd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
-        obd->obd_osfs_age = get_jiffies_64();
-        spin_unlock(&obd->obd_osfs_lock);
  
          if (success == 0) {
                  memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
@@ -1582,7 +1605,13 @@ static int cb_statfs_update(struct obd_info *oinfo, int rc)
                  RETURN(rc);
          }
  
-        lov_update_statfs(obd, osfs, lov_sfs, success);
+        spin_lock(&obd->obd_osfs_lock);
+        memcpy(&obd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+        if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
+                obd->obd_osfs_age = cfs_time_current_64();
+        spin_unlock(&obd->obd_osfs_lock);
+
+        lov_update_statfs(osfs, lov_sfs, success);
          qos_update(lov);
  
          RETURN(0);
@@ -1625,6 +1654,7 @@ int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
  
                  req->rq_idx = i;
                  req->rq_oi.oi_cb_up = cb_statfs_update;
+                req->rq_oi.oi_flags = oinfo->oi_flags;
                  req->rq_rqset = set;
  
                  lov_set_add_req(req, set);
diff --git a/lustre/lov/lproc_lov.c b/lustre/lov/lproc_lov.c

index b486995..71ab80f 100644 (file)
--- a/lustre/lov/lproc_lov.c
+++ b/lustre/lov/lproc_lov.c
@@ -1,33 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <asm/statfs.h>
-#endif
  #include <lprocfs_status.h>
  #include <obd_class.h>
  #include <linux/seq_file.h>
@@ -134,8 +143,8 @@ static int lov_rd_stripecount(char *page, char **start, off_t off, int count,
          LASSERT(dev != NULL);
          desc = &dev->u.lov.desc;
          *eof = 1;
-        return snprintf(page, count, "%ld\n",
-                        (long)(desc->ld_default_stripe_count + 1) - 1);
+        return snprintf(page, count, "%d\n",
+                        (__s16)(desc->ld_default_stripe_count + 1) - 1);
  }
  
  static int lov_wr_stripecount(struct file *file, const char *buffer,
@@ -361,4 +370,3 @@ struct file_operations lov_proc_target_fops = {
          .release = lprocfs_seq_release,
  };
  #endif /* LPROCFS */
-
diff --git a/lustre/lvfs/autoMakefile.am b/lustre/lvfs/autoMakefile.am

index b7b3a48..37ab888 100644 (file)
--- a/lustre/lvfs/autoMakefile.am
+++ b/lustre/lvfs/autoMakefile.am
@@ -1,7 +1,39 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
+
  if LIBLUSTRE
  noinst_LIBRARIES = liblvfs.a
  liblvfs_a_SOURCES = lvfs_userfs.c prng.c lvfs_lib.c
@@ -81,4 +113,3 @@ DIST_SOURCES = fsfilt.c fsfilt_ext3.c fsfilt_reiserfs.c lvfs_common.c \
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
  CLEANFILES = fsfilt-*.c fsfilt_ldiskfs*.c fsfilt_extN.c sources
-
diff --git a/lustre/lvfs/fsfilt.c b/lustre/lvfs/fsfilt.c

index 6f88917..e9d365b 100644 (file)
--- a/lustre/lvfs/fsfilt.c
+++ b/lustre/lvfs/fsfilt.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c

index 88bf8dd..43b3f0c 100644 (file)
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/lib/fsfilt_ext3.c
- *  Lustre filesystem abstraction routines
+ * GPL HEADER START
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/fsfilt_ext3.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_FILTER
@@ -40,19 +55,12 @@
  #include <linux/quotaio_v1.h>
  #include <linux/quotaio_v2.h>
  #include <linux/parser.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/ext3_xattr.h>
-#else
  #include <ext3/xattr.h>
-#endif
  
  #include <libcfs/kp30.h>
  #include <lustre_fsfilt.h>
  #include <obd.h>
  #include <lustre_quota.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/iobuf.h>
-#endif
  #include <linux/lustre_compat25.h>
  #include <linux/lprocfs_status.h>
  
@@ -77,15 +85,6 @@
  #define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS
  #endif
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#define fsfilt_ext3_journal_start(inode, nblocks) \
-                                journal_start(EXT3_JOURNAL(inode),nblocks)
-#define fsfilt_ext3_journal_stop(handle)          journal_stop(handle)
-#else
-#define fsfilt_ext3_journal_start(inode, nblocks) ext3_journal_start(inode, nblocks)
-#define fsfilt_ext3_journal_stop(handle)          ext3_journal_stop(handle)
-#endif
-
  static cfs_mem_cache_t *fcb_cache;
  
  struct fsfilt_cb_data {
@@ -99,6 +98,9 @@ struct fsfilt_cb_data {
  #ifndef EXT3_XATTR_INDEX_TRUSTED        /* temporary until we hit l28 kernel */
  #define EXT3_XATTR_INDEX_TRUSTED        4
  #endif
+#ifndef XATTR_NO_CTIME
+#define XATTR_NO_CTIME 0x80
+#endif
  
  static char *fsfilt_ext3_get_label(struct super_block *sb)
  {
@@ -113,9 +115,7 @@ static int fsfilt_ext3_set_label(struct super_block *sb, char *label)
          int err;
  
          journal = EXT3_SB(sb)->s_journal;
-        lock_24kernel();
          handle = journal_start(journal, 1);
-        unlock_24kernel();
          if (IS_ERR(handle)) {
                  CERROR("can't start transaction\n");
                  return(PTR_ERR(handle));
@@ -131,9 +131,7 @@ static int fsfilt_ext3_set_label(struct super_block *sb, char *label)
          err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
  
  out:
-        lock_24kernel();
          journal_stop(handle);
-        unlock_24kernel();
  
          return(err);
  }
@@ -149,6 +147,8 @@ static char *fsfilt_ext3_uuid(struct super_block *sb)
   */
  static __u64 fsfilt_ext3_get_version(struct inode *inode)
  {
+        CDEBUG(D_INFO, "Get version "LPX64" for inode %lu\n",
+               EXT3_I(inode)->i_fs_version, inode->i_ino);
          return EXT3_I(inode)->i_fs_version;
  }
  
@@ -159,7 +159,12 @@ static __u64 fsfilt_ext3_set_version(struct inode *inode, __u64 new_version)
  {
          __u64 old_version = EXT3_I(inode)->i_fs_version;
  
+        CDEBUG(D_INFO, "Set version "LPX64" (old "LPX64") for inode %lu\n",
+               new_version, old_version, inode->i_ino);
          (EXT3_I(inode))->i_fs_version = new_version;
+        /* version is set after all inode operations are finished, so we should
+         * mark it dirty here */
+        inode->i_sb->s_op->dirty_inode(inode);
          return old_version;
  }
  
@@ -211,7 +216,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
                                cpu_to_le32(EXT3_EXTENTS_FL | EXT3_INDEX_FL)) ==
                                cpu_to_le32(EXT3_EXTENTS_FL | EXT3_INDEX_FL))) {
                                  CWARN("extent-mapped directory found - contact "
-                                      "CFS: support@clusterfs.com\n");
+                                      "http://bugzilla.lustre.org/\n");
                                  warned = 1;
                          }
                  }
@@ -273,9 +278,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
  
   journal_start:
          LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
-        lock_24kernel();
-        handle = fsfilt_ext3_journal_start(inode, nblocks);
-        unlock_24kernel();
+        handle = ext3_journal_start(inode, nblocks);
  
          if (!IS_ERR(handle))
                  LASSERT(current->journal_info == handle);
@@ -412,9 +415,7 @@ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
          }
  
          LASSERTF(needed > 0, "can't start %d credit transaction\n", needed);
-        lock_24kernel();
-        handle = fsfilt_ext3_journal_start(fso->fso_dentry->d_inode, needed);
-        unlock_24kernel();
+        handle = ext3_journal_start(fso->fso_dentry->d_inode, needed);
          if (IS_ERR(handle)) {
                  CERROR("can't get handle for %d credits: rc = %ld\n", needed,
                         PTR_ERR(handle));
@@ -454,9 +455,7 @@ static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
          if (force_sync)
                  handle->h_sync = 1; /* recovery likes this */
  
-        lock_24kernel();
-        rc = fsfilt_ext3_journal_stop(handle);
-        unlock_24kernel();
+        rc = ext3_journal_stop(handle);
  
          return rc;
  }
@@ -466,36 +465,23 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h,
  {
          unsigned long tid;
          transaction_t *transaction;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
-        unsigned long rtid;
-#endif
          handle_t *handle = h;
          journal_t *journal;
          int rc;
  
          LASSERT(current->journal_info == handle);
  
-        lock_24kernel();
          transaction = handle->h_transaction;
          journal = transaction->t_journal;
          tid = transaction->t_tid;
          /* we don't want to be blocked */
          handle->h_sync = 0;
-        rc = fsfilt_ext3_journal_stop(handle);
+        rc = ext3_journal_stop(handle);
          if (rc) {
                  CERROR("error while stopping transaction: %d\n", rc);
-                unlock_24kernel();
                  return rc;
          }
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
-        rtid = log_start_commit(journal, transaction);
-        if (rtid != tid)
-                CERROR("strange race: %lu != %lu\n",
-                       (unsigned long) tid, (unsigned long) rtid);
-#else
          log_start_commit(journal, tid);
-#endif
-        unlock_24kernel();
  
          *wait_handle = (void *) tid;
          CDEBUG(D_INODE, "commit async: %lu\n", (unsigned long) tid);
@@ -524,8 +510,6 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle,
          struct inode *inode = dentry->d_inode;
          int rc = 0;
  
-        lock_24kernel();
-
          /* Avoid marking the inode dirty on the superblock list unnecessarily.
           * We are already writing the inode to disk as part of this
           * transaction and want to avoid a lot of extra inode writeout
@@ -576,7 +560,6 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle,
          }
  
   out:
-        unlock_24kernel();
          RETURN(rc);
  }
  
@@ -607,11 +590,8 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
  
          LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
  
-        lock_24kernel();
          rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED,
-                                   name, lmm, lmm_size, 0);
-
-        unlock_24kernel();
+                                   name, lmm, lmm_size, XATTR_NO_CTIME);
  
          if (rc && rc != -EROFS)
                  CERROR("error adding MD data to inode %lu: rc = %d\n",
@@ -626,12 +606,9 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size,
          int rc;
  
          LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
-        lock_24kernel();
  
          rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
                              name, lmm, lmm_size);
-        unlock_24kernel();
-
          /* This gives us the MD size */
          if (lmm == NULL)
                  return (rc == -ENODATA) ? 0 : rc;
@@ -647,43 +624,11 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size,
          return rc;
  }
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
  static int fsfilt_ext3_send_bio(int rw, struct inode *inode, struct bio *bio)
  {
          submit_bio(rw, bio);
          return 0;
  }
-#else
-static int fsfilt_ext3_send_bio(int rw, struct inode *inode, struct kiobuf *bio)
-{
-        int rc, blk_per_page;
-
-        rc = brw_kiovec(rw, 1, &bio, inode->i_dev,
-                        KIOBUF_GET_BLOCKS(bio), 1 << inode->i_blkbits);
-        /*
-         * brw_kiovec() returns number of bytes actually written. If error
-         * occurred after something was written, error code is returned though
-         * kiobuf->errno. (See bug 6854.)
-         */
-
-        blk_per_page = CFS_PAGE_SIZE >> inode->i_blkbits;
-
-        if (rc != (1 << inode->i_blkbits) * bio->nr_pages * blk_per_page) {
-                CERROR("short write?  expected %d, wrote %d (%d)\n",
-                       (1 << inode->i_blkbits) * bio->nr_pages * blk_per_page,
-                       rc, bio->errno);
-        }
-        if (bio->errno != 0) {
-                CERROR("IO error. Wrote %d of %d (%d)\n",
-                       rc,
-                       (1 << inode->i_blkbits) * bio->nr_pages * blk_per_page,
-                       bio->errno);
-                rc = bio->errno;
-        }
-
-        return rc;
-}
-#endif
  
  static ssize_t fsfilt_ext3_readpage(struct file *file, char *buf, size_t count,
                                      loff_t *off)
@@ -766,10 +711,8 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
          fcb->cb_data = cb_data;
  
          CDEBUG(D_EXT2, "set callback for last_rcvd: "LPD64"\n", last_rcvd);
-        lock_24kernel();
          journal_callback_set(handle, fsfilt_ext3_cb_func,
                               (struct journal_callback *)fcb);
-        unlock_24kernel();
  
          return 0;
  }
@@ -812,10 +755,7 @@ static int fsfilt_ext3_sync(struct super_block *sb)
  #endif
  
  #ifdef EXT3_MULTIBLOCK_ALLOCATOR
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#define ext3_up_truncate_sem(inode)  up_write(&EXT3_I(inode)->truncate_sem);
-#define ext3_down_truncate_sem(inode)  down_write(&EXT3_I(inode)->truncate_sem);
-#elif (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17))
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17))
  #define ext3_up_truncate_sem(inode)  up(&EXT3_I(inode)->truncate_sem);
  #define ext3_down_truncate_sem(inode)  down(&EXT3_I(inode)->truncate_sem);
  #else
@@ -902,25 +842,8 @@ static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
          return bg_start + colour + block;
  }
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/locks.h>
-static void ll_unmap_underlying_metadata(struct super_block *sb,
-                                         unsigned long blocknr)
-{
-        struct buffer_head *old_bh;
-
-        old_bh = get_hash_table(sb->s_dev, blocknr, sb->s_blocksize);
-        if (old_bh) {
-                mark_buffer_clean(old_bh);
-                wait_on_buffer(old_bh);
-                clear_bit(BH_Req, &old_bh->b_state);
-                __brelse(old_bh);
-        }
-}
-#else
  #define ll_unmap_underlying_metadata(sb, blocknr) \
          unmap_underlying_metadata((sb)->s_bdev, blocknr)
-#endif
  
  #ifndef EXT3_MB_HINT_GROUP_ALLOC
  static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
@@ -933,9 +856,7 @@ static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
  
          goal = ext3_ext_find_goal(inode, path, block, &aflags);
          aflags |= 2; /* block have been already reserved */
-        lock_24kernel();
          pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
-        unlock_24kernel();
          return pblock;
  
  }
@@ -1032,9 +953,7 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
          count = ext3_ext_calc_credits_for_insert(base, path);
          ext3_up_truncate_sem(inode);
  
-        lock_24kernel();
-        handle = fsfilt_ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
-        unlock_24kernel();
+        handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
          if (IS_ERR(handle)) {
                  ext3_down_truncate_sem(inode);
                  return PTR_ERR(handle);
@@ -1043,9 +962,7 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
          ext3_down_truncate_sem(inode);
          if (tgen != EXT_GENERATION(base)) {
                  /* the tree has changed. so path can be invalid at moment */
-                lock_24kernel();
-                fsfilt_ext3_journal_stop(handle);
-                unlock_24kernel();
+                ext3_journal_stop(handle);
                  return EXT_REPEAT;
          }
  
@@ -1061,9 +978,13 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
          nex.ee_len = count;
          err = ext3_ext_insert_extent(handle, base, path, &nex);
          if (err) {
-                CERROR("can't insert extent: %d\n", err);
-                /* XXX: export ext3_free_blocks() */
-                /*ext3_free_blocks(handle, inode, nex.ee_start, nex.ee_len, 0);*/
+                /* free data blocks we just allocated */
+                /* not a good idea to call discard here directly,
+                 * but otherwise we'd need to call it every free() */
+#ifdef EXT3_MB_HINT_GROUP_ALLOC
+                ext3_mb_discard_inode_preallocations(inode);
+#endif
+                ext3_free_blocks(handle, inode, nex.ee_start, nex.ee_len, 0);
                  goto out;
          }
  
@@ -1078,9 +999,7 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
          BUG_ON(nex.ee_block != cex->ec_block);
  
  out:
-        lock_24kernel();
-        fsfilt_ext3_journal_stop(handle);
-        unlock_24kernel();
+        ext3_journal_stop(handle);
  map:
          if (err >= 0) {
                  /* map blocks */
@@ -1312,10 +1231,8 @@ static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
          block_count = (*offs & (blocksize - 1)) + bufsize;
          block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
  
-        lock_24kernel();
-        handle = fsfilt_ext3_journal_start(inode,
+        handle = ext3_journal_start(inode,
                                 block_count * FSFILT_DATA_TRANS_BLOCKS(inode->i_sb) + 2);
-        unlock_24kernel();
          if (IS_ERR(handle)) {
                  CERROR("can't start transaction for %d blocks (%d bytes)\n",
                         block_count * FSFILT_DATA_TRANS_BLOCKS(inode->i_sb) + 2, bufsize);
@@ -1374,9 +1291,7 @@ out:
                  unlock_kernel();
          }
  
-        lock_24kernel();
-        fsfilt_ext3_journal_stop(handle);
-        unlock_24kernel();
+        ext3_journal_stop(handle);
  
          if (err == 0)
                  *offs = offset;
@@ -1385,7 +1300,9 @@ out:
  
  static int fsfilt_ext3_setup(struct super_block *sb)
  {
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,6)) && defined(HAVE_QUOTA_SUPPORT)
          struct ext3_sb_info *sbi = EXT3_SB(sb);
+#endif
  #if 0
          sbi->dx_lock = fsfilt_ext3_dx_lock;
          sbi->dx_unlock = fsfilt_ext3_dx_unlock;
@@ -1397,7 +1314,7 @@ static int fsfilt_ext3_setup(struct super_block *sb)
  #endif
          if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
                  CWARN("filesystem doesn't have dir_index feature enabled\n");
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,6)) && HAVE_QUOTA_SUPPORT
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,6)) && defined(HAVE_QUOTA_SUPPORT)
          /* enable journaled quota support */
          /* kfreed in ext3_put_super() */
          sbi->s_qf_names[USRQUOTA] = kstrdup("lquota.user.reserved", GFP_KERNEL);
@@ -1448,6 +1365,7 @@ static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs)
          return 0;
  }
  
+#ifdef HAVE_QUOTA_SUPPORT
  #define DQINFO_COPY(out, in)                    \
  do {                                            \
          Q_COPY(out, in, dqi_bgrace);            \
@@ -2137,7 +2055,6 @@ out:
          RETURN(rc);
  }
  
-#ifdef HAVE_QUOTA_SUPPORT
  static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type, 
                                   int cmd)
  {
@@ -2215,15 +2132,29 @@ static int fsfilt_ext3_dquot(struct lustre_dquot *dquot, int cmd)
          }
          RETURN(rc);
  }
+
+static int fsfilt_ext3_get_mblk(struct super_block *sb, int *count,
+                                struct inode *inode, int frags)
+{
+#ifdef EXT3_EXT_HAS_NO_TREE
+        struct ext3_ext_base *base = inode;
+#else
+        struct ext3_extents_tree tree;
+        struct ext3_ext_base *base = &tree;
+
+        ext3_init_tree_desc(base, inode);
+#endif
+        /* for an ost_write request, it needs <#fragments> * <tree depth + 1>
+         * metablocks at maxium b=16542 */
+        *count = frags * (EXT_DEPTH(base) + 1) * EXT3_BLOCK_SIZE(sb);
+        return 0;
+}
+
  #endif
  
  static lvfs_sbdev_type fsfilt_ext3_journal_sbdev(struct super_block *sb)
  {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
          return (EXT3_SB(sb)->journal_bdev);
-#else
-        return kdev_t_to_nr(EXT3_SB(sb)->s_journal->j_dev);
-#endif
  }
  
  static struct fsfilt_operations fsfilt_ext3_ops = {
@@ -2252,16 +2183,17 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
          .fs_setup               = fsfilt_ext3_setup,
          .fs_send_bio            = fsfilt_ext3_send_bio,
          .fs_get_op_len          = fsfilt_ext3_get_op_len,
-        .fs_quotactl            = fsfilt_ext3_quotactl,
-        .fs_quotacheck          = fsfilt_ext3_quotacheck,
  #ifdef HAVE_DISK_INODE_VERSION
          .fs_get_version         = fsfilt_ext3_get_version,
          .fs_set_version         = fsfilt_ext3_set_version,
  #endif
  #ifdef HAVE_QUOTA_SUPPORT
+        .fs_quotactl            = fsfilt_ext3_quotactl,
+        .fs_quotacheck          = fsfilt_ext3_quotacheck,
          .fs_quotainfo           = fsfilt_ext3_quotainfo,
          .fs_qids                = fsfilt_ext3_qids,
          .fs_dquot               = fsfilt_ext3_dquot,
+        .fs_get_mblk            = fsfilt_ext3_get_mblk,
  #endif
          .fs_journal_sbdev       = fsfilt_ext3_journal_sbdev,
  };
@@ -2299,6 +2231,6 @@ static void __exit fsfilt_ext3_exit(void)
  module_init(fsfilt_ext3_init);
  module_exit(fsfilt_ext3_exit);
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
  MODULE_LICENSE("GPL");
diff --git a/lustre/lvfs/fsfilt_reiserfs.c b/lustre/lvfs/fsfilt_reiserfs.c

index 00a0433..a0513f4 100644 (file)
--- a/lustre/lvfs/fsfilt_reiserfs.c
+++ b/lustre/lvfs/fsfilt_reiserfs.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/lib/fsfilt_reiserfs.c
- *  Lustre filesystem abstraction routines
+ * GPL HEADER START
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/fsfilt_reiserfs.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  /*
@@ -236,7 +251,7 @@ static void __exit fsfilt_reiserfs_exit(void)
          fsfilt_unregister_ops(&fsfilt_reiserfs_ops);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre reiserfs Filesystem Helper v0.1");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/lvfs/lustre_quota_fmt.c b/lustre/lvfs/lustre_quota_fmt.c

index e910a0c..e168c99 100644 (file)
--- a/lustre/lvfs/lustre_quota_fmt.c
+++ b/lustre/lvfs/lustre_quota_fmt.c
@@ -1,10 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre administrative quota format.
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lustre_quota_fmt.c
   *
- *  from
- *  linux/fs/quota_v2.c
+ * Lustre administrative quota format.
+ * from linux/fs/quota_v2.c
   */
  
  #ifndef EXPORT_SYMTAB
@@ -27,6 +59,8 @@
  #include <obd_support.h>
  #include "lustre_quota_fmt.h"
  
+#ifdef HAVE_QUOTA_SUPPORT
+
  static const uint lustre_initqversions[][MAXQUOTAS] = {
          [LUSTRE_QUOTA_V1] = LUSTRE_INITQVERSIONS,
          [LUSTRE_QUOTA_V2] = LUSTRE_INITQVERSIONS_V2
@@ -138,9 +172,8 @@ int lustre_write_quota_info(struct lustre_quota_info *lqi, int type)
                                sizeof(struct lustre_disk_dqinfo), &offset);
          set_fs(fs);
          if (size != sizeof(struct lustre_disk_dqinfo)) {
-                printk(KERN_WARNING
-                       "Can't write info structure on device %s.\n",
-                       f->f_vfsmnt->mnt_sb->s_id);
+                CWARN("Can't write info structure on device %s.\n",
+                      f->f_vfsmnt->mnt_sb->s_id);
                  return -1;
          }
          return 0;
@@ -208,8 +241,7 @@ dqbuf_t getdqbuf(void)
  {
          dqbuf_t buf = kmalloc(LUSTRE_DQBLKSIZE, GFP_NOFS);
          if (!buf)
-                printk(KERN_WARNING
-                       "VFS: Not enough memory for quota buffers.\n");
+                CWARN("VFS: Not enough memory for quota buffers.\n");
          return buf;
  }
  
@@ -554,8 +586,8 @@ static int lustre_write_dquot(struct lustre_dquot *dquot,
                                  dqblk_sz, &offset);
          set_fs(fs);
          if (ret != dqblk_sz) {
-                printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-                       filp->f_dentry->d_sb->s_id);
+                CWARN("VFS: dquota write failed on dev %s\n",
+                      filp->f_dentry->d_sb->s_id);
                  if (ret >= 0)
                          ret = -ENOSPC;
          } else
@@ -1077,3 +1109,4 @@ EXPORT_SYMBOL(lustre_read_dquot);
  EXPORT_SYMBOL(lustre_commit_dquot);
  EXPORT_SYMBOL(lustre_init_quota_info);
  EXPORT_SYMBOL(lustre_get_qids);
+#endif
diff --git a/lustre/lvfs/lustre_quota_fmt.h b/lustre/lvfs/lustre_quota_fmt.h

index 2139ae0..a9fbcfc 100644 (file)
--- a/lustre/lvfs/lustre_quota_fmt.h
+++ b/lustre/lvfs/lustre_quota_fmt.h
@@ -1,14 +1,48 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre administrative quota format
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- * from
- * include/linux/quotaio_v2.h
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lustre_quota_fmt.h
+ *
+ * Lustre administrative quota format
+ * from include/linux/quotaio_v2.h
   */
  #ifndef _LUSTRE_QUOTA_FMT_H
  #define _LUSTRE_QUOTA_FMT_H
  
+#ifdef HAVE_QUOTA_SUPPORT
+
  #include <linux/types.h>
  #include <linux/quota.h>
  
@@ -190,3 +224,4 @@ int lustre_quota_convert(struct lustre_quota_info *lqi, int type);
  #define LUSTRE_OPQFILES_NAMES { { "lquota.user", "lquota.group" }, \
                                  { "lquota_v2.user", "lquota_v2.group" } }
  #endif                          /* lustre_quota_fmt.h */
+#endif
diff --git a/lustre/lvfs/lustre_quota_fmt_convert.c b/lustre/lvfs/lustre_quota_fmt_convert.c

index baa37fb..de85f8b 100644 (file)
--- a/lustre/lvfs/lustre_quota_fmt_convert.c
+++ b/lustre/lvfs/lustre_quota_fmt_convert.c
@@ -1,10 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * convert quota format.
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *  from
- *  linux/fs/quota_v2.c
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lustre_quota_fmt_convert.c
+ *
+ * convert quota format.
+ * from linux/fs/quota_v2.c
   */
  
  #ifndef EXPORT_SYMTAB
@@ -27,6 +59,8 @@
  #include <obd_support.h>
  #include "lustre_quota_fmt.h"
  
+#ifdef HAVE_QUOTA_SUPPORT
+
  static int admin_convert_dqinfo(struct file *fp_v1, struct file *fp_v2,
                                  struct lustre_quota_info *lqi, int type)
  {
@@ -268,4 +302,5 @@ out:
          return rc;
  }
  EXPORT_SYMBOL(lustre_slave_quota_convert);
-#endif
+#endif /* HAVE_QUOTA64 */
+#endif /* HAVE_QUOTA_SUPPORT */
diff --git a/lustre/lvfs/lvfs_common.c b/lustre/lvfs/lvfs_common.c

index 1834616..6b907ee 100644 (file)
--- a/lustre/lvfs/lvfs_common.c
+++ b/lustre/lvfs/lvfs_common.c
@@ -1,22 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org/
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
diff --git a/lustre/lvfs/lvfs_darwin.c b/lustre/lvfs/lvfs_darwin.c

index 1feb31c..cec260b 100644 (file)
--- a/lustre/lvfs/lvfs_darwin.c
+++ b/lustre/lvfs/lvfs_darwin.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #define DEBUG_SUBSYSTEM S_FILTER
  
  #include <libcfs/libcfs.h>
@@ -42,4 +78,3 @@ static void __exit lvfs_exit(void)
  }
  
  cfs_module(lvfs, "1.0.0", lvfs_init, lvfs_exit);
-
diff --git a/lustre/lvfs/lvfs_internal.h b/lustre/lvfs/lvfs_internal.h

index 091fd99..a50b2d8 100644 (file)
--- a/lustre/lvfs/lvfs_internal.h
+++ b/lustre/lvfs/lvfs_internal.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  int  fsfilt_ext3_init(void);
  void fsfilt_ext3_exit(void);
  
diff --git a/lustre/lvfs/lvfs_lib.c b/lustre/lvfs/lvfs_lib.c

index 978000c..3f354af 100644 (file)
--- a/lustre/lvfs/lvfs_lib.c
+++ b/lustre/lvfs/lvfs_lib.c
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/lvfs/lvfs_lib.c
- *  Lustre filesystem abstraction routines
+ * GPL HEADER START
   *
- *  Copyright (C) 2007 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_lib.c
+ *
+ * Lustre filesystem abstraction routines
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  #ifdef __KERNEL__
  #include <linux/module.h>
diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c

index beea8ba..631d8bc 100644 (file)
--- a/lustre/lvfs/lvfs_linux.c
+++ b/lustre/lvfs/lvfs_linux.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/lib/lvfs_linux.c
- *  Lustre filesystem abstraction routines
+ * GPL HEADER START
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_linux.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -267,7 +282,8 @@ out_up:
  EXPORT_SYMBOL(simple_mknod);
  
  /* utility to make a directory */
-struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
+struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
+                            char *name, int mode, int fix)
  {
          struct dentry *dchild;
          int err = 0;
@@ -300,7 +316,7 @@ struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
                  GOTO(out_up, dchild);
          }
  
-        err = vfs_mkdir(dir->d_inode, dchild, mode);
+        err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
          if (err)
                  GOTO(out_err, err);
  
@@ -315,7 +331,8 @@ out_up:
  EXPORT_SYMBOL(simple_mkdir);
  
  /* utility to rename a file */
-int lustre_rename(struct dentry *dir, char *oldname, char *newname)
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt, 
+                  char *oldname, char *newname)
  {
          struct dentry *dchild_old, *dchild_new;
          int err = 0;
@@ -336,7 +353,8 @@ int lustre_rename(struct dentry *dir, char *oldname, char *newname)
          if (IS_ERR(dchild_new))
                  GOTO(put_old, err = PTR_ERR(dchild_new));
  
-        err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
+        err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, 
+                            dir->d_inode, dchild_new, mnt);
  
          dput(dchild_new);
  put_old:
@@ -455,6 +473,57 @@ long l_readdir(struct file *file, struct list_head *dentry_list)
  }
  EXPORT_SYMBOL(l_readdir);
  
+int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
+                    struct iattr *newattrs)
+{
+        int rc;
+
+        LOCK_INODE_MUTEX(dchild->d_inode);
+#ifdef HAVE_SECURITY_PLUG
+        rc = notify_change(dchild, mnt, newattrs);
+#else
+        rc = notify_change(dchild, newattrs);
+#endif
+        UNLOCK_INODE_MUTEX(dchild->d_inode);
+        return rc;
+}
+EXPORT_SYMBOL(l_notify_change);
+
+/* utility to truncate a file */
+int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
+                    char *name, loff_t length)
+{
+        struct dentry *dchild;
+        struct iattr newattrs;
+        int err = 0;
+        ENTRY;
+
+        CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
+               name, (long long)length);
+        dchild = ll_lookup_one_len(name, dir, strlen(name));
+        if (IS_ERR(dchild))
+                GOTO(out, err = PTR_ERR(dchild));
+
+        if (dchild->d_inode) {
+                int old_mode = dchild->d_inode->i_mode;
+                if (S_ISDIR(old_mode)) {
+                        CERROR("found %s (%lu/%u) is mode %o\n", name,
+                               dchild->d_inode->i_ino,
+                               dchild->d_inode->i_generation, old_mode);
+                        GOTO(out_dput, err = -EISDIR);
+                }
+
+                newattrs.ia_size = length;
+                newattrs.ia_valid = ATTR_SIZE;
+                err = l_notify_change(mnt, dchild, &newattrs);
+        }
+        EXIT;
+out_dput:
+        dput(dchild);
+out:
+        return err;
+}
+EXPORT_SYMBOL(simple_truncate);
  
  #ifdef LUSTRE_KERNEL_VERSION
  #ifndef HAVE_CLEAR_RDONLY_ON_PUT
@@ -505,6 +574,6 @@ int lvfs_check_io_health(struct obd_device *obd, struct file *file)
  }
  EXPORT_SYMBOL(lvfs_check_io_health);
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
  MODULE_LICENSE("GPL");
diff --git a/lustre/lvfs/lvfs_userfs.c b/lustre/lvfs/lvfs_userfs.c

index 28afe5f..88aa69d 100644 (file)
--- a/lustre/lvfs/lvfs_userfs.c
+++ b/lustre/lvfs/lvfs_userfs.c
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/lib/lvfs_userfs.c
- *  Lustre filesystem abstraction routines
+ * GPL HEADER START
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_userfs.c
+ *
+ * Lustre filesystem abstraction routines
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #include <liblustre.h>
diff --git a/lustre/lvfs/prng.c b/lustre/lvfs/prng.c

index 168df78..53fdb2a 100644 (file)
--- a/lustre/lvfs/prng.c
+++ b/lustre/lvfs/prng.c
@@ -1,14 +1,46 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/prng.c
   *
   * concatenation of following two 16-bit multiply with carry generators
   * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16,
   * number and carry packed within the same 32 bit integer.
   * algorithm recommended by Marsaglia
- ******************************************************************/
+*/
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
diff --git a/lustre/lvfs/quotafmt_test.c b/lustre/lvfs/quotafmt_test.c

index 3e7f592..a371193 100644 (file)
--- a/lustre/lvfs/quotafmt_test.c
+++ b/lustre/lvfs/quotafmt_test.c
@@ -1,10 +1,46 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/quotafmt_test.c
+ *
+ * No redistribution or use is permitted outside of Sun Microsystems, Inc.
   *
   * Kernel module to test lustre administrative quotafile format APIs
- * from the OBD setup function */
+ * from the OBD setup function
+ */
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
@@ -21,6 +57,8 @@
  
  #include "lustre_quota_fmt.h"
  
+#ifdef HAVE_QUOTA_SUPPORT
+
  char *test_quotafile[2] = { "usrquota_test", "grpquota_test" };
  
  static int quotfmt_initialize(struct lustre_quota_info *lqi,
@@ -48,7 +86,8 @@ static int quotfmt_initialize(struct lustre_quota_info *lqi,
                  LOCK_INODE_MUTEX(parent_inode);
                  de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
                  if (!IS_ERR(de) && de->d_inode)
-                        vfs_unlink(parent_inode, de);
+                        ll_vfs_unlink(parent_inode, de, 
+                                      tgt->obd_lvfs_ctxt.pwdmnt);
                  if (!IS_ERR(de))
                          dput(de);
                  UNLOCK_INODE_MUTEX(parent_inode);
@@ -109,7 +148,7 @@ static int quotfmt_finalize(struct lustre_quota_info *lqi,
                          goto dput;
                  }
  
-                rc = vfs_unlink(parent_inode, de);
+                rc = ll_vfs_unlink(parent_inode, de, tgt->obd_lvfs_ctxt.pwdmnt);
                  if (rc)
                          CERROR("error unlink quotafile %s (rc = %d)\n",
                                 name, rc);
@@ -500,9 +539,11 @@ static void __exit quotfmt_test_exit(void)
          class_unregister_type("quotfmt_test");
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("administrative quotafile test module");
  MODULE_LICENSE("GPL");
  
  module_init(quotfmt_test_init);
  module_exit(quotfmt_test_exit);
+
+#endif /* HAVE_QUOTA_SUPPORT */
diff --git a/lustre/lvfs/upcall_cache.c b/lustre/lvfs/upcall_cache.c

index 61e1e85..8894557 100644 (file)
--- a/lustre/lvfs/upcall_cache.c
+++ b/lustre/lvfs/upcall_cache.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Supplementary groups cache.
+ * GPL HEADER START
   *
- *  Copyright (c) 2004 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/upcall_cache.c
+ *
+ * Supplementary groups cache.
   */
  
  #define DEBUG_SUBSYSTEM S_SEC
diff --git a/lustre/mdc/Makefile.in b/lustre/mdc/Makefile.in

index b9b9793..95dfffb 100644 (file)
--- a/lustre/mdc/Makefile.in
+++ b/lustre/mdc/Makefile.in
@@ -1,4 +1,4 @@
  MODULES := mdc
-mdc-objs := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o
+mdc-objs := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o mdc_fid.o
  
  @INCLUDE_RULES@
diff --git a/lustre/mdc/autoMakefile.am b/lustre/mdc/autoMakefile.am

index e39cc9f..61c1e86 100644 (file)
--- a/lustre/mdc/autoMakefile.am
+++ b/lustre/mdc/autoMakefile.am
@@ -1,11 +1,42 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if LIBLUSTRE
  noinst_LIBRARIES = libmdc.a
-libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c
+libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c mdc_fid.c
  libmdc_a_CPPFLAGS = $(LLCPPFLAGS)
  libmdc_a_CFLAGS = $(LLCFLAGS)
  endif
diff --git a/lustre/mdc/lproc_mdc.c b/lustre/mdc/lproc_mdc.c

index dafe84e..accb538 100644 (file)
--- a/lustre/mdc/lproc_mdc.c
+++ b/lustre/mdc/lproc_mdc.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
@@ -66,7 +77,7 @@ static int mdc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
  }
  static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
          { "uuid",            lprocfs_rd_uuid,        0, 0 },
-        { "ping",            0, lprocfs_wr_ping,        0 },
+        { "ping",            0, lprocfs_wr_ping,     0, 0, 0222 },
          { "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
          { "blocksize",       lprocfs_rd_blksize,     0, 0 },
          { "kbytestotal",     lprocfs_rd_kbytestotal, 0, 0 },
@@ -80,6 +91,7 @@ static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
          { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
                                  mdc_wr_max_rpcs_in_flight, 0 },
          { "timeouts",        lprocfs_rd_timeouts,    0, 0 },
+        { "import",          lprocfs_rd_import,    0, 0 },
          { 0 }
  };
  
@@ -94,4 +106,3 @@ void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
      lvars->obd_vars     = lprocfs_mdc_obd_vars;
  }
  #endif /* LPROCFS */
-
diff --git a/lustre/mdc/mdc_fid.c b/lustre/mdc/mdc_fid.c

new file mode 100644 (file)

index 0000000..ac3379c
--- /dev/null
+++ b/lustre/mdc/mdc_fid.c
@@ -0,0 +1,414 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mdc/mdc_fid.c
+ *
+ * MDC fid management
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_FID
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+# include <linux/module.h>
+#else /* __KERNEL__ */
+# include <liblustre.h>
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include "mdc_internal.h"
+
+static int seq_client_rpc(struct lu_client_seq *seq, struct lu_seq_range *input,
+                          struct lu_seq_range *output, __u32 opc,
+                          const char *opcname)
+{
+        int rc;
+        __u32 size[3] = { sizeof(struct ptlrpc_body),
+                            sizeof(__u32),
+                            sizeof(struct lu_seq_range) };
+        struct obd_export *exp = seq->lcs_exp;
+        struct ptlrpc_request *req;
+        struct lu_seq_range *out, *in;
+        __u32 *op;
+        ENTRY;
+
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              SEQ_QUERY, 3, size, NULL);
+        if (req == NULL)
+                RETURN(-ENOMEM);
+
+        req->rq_export = class_export_get(exp);
+        op = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(__u32));
+        *op = opc;
+
+        /* Zero out input range, this is not recovery yet. */
+        in = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1,
+                            sizeof(struct lu_seq_range));
+        if (input != NULL)
+                *in = *input;
+        else
+                range_init(in);
+
+        size[1] = sizeof(struct lu_seq_range);
+        ptlrpc_req_set_repsize(req, 2, size);
+
+        LASSERT(seq->lcs_type == LUSTRE_SEQ_METADATA);
+        req->rq_request_portal = SEQ_METADATA_PORTAL;
+
+        mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+        rc = ptlrpc_queue_wait(req);
+        mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+        if (rc)
+                GOTO(out_req, rc);
+
+        out = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
+                            sizeof(struct lu_seq_range));
+        *output = *out;
+
+        if (!range_is_sane(output)) {
+                CERROR("%s: Invalid range received from server: "
+                       DRANGE"\n", seq->lcs_name, PRANGE(output));
+                GOTO(out_req, rc = -EINVAL);
+        }
+
+        if (range_is_exhausted(output)) {
+                CERROR("%s: Range received from server is exhausted: "
+                       DRANGE"]\n", seq->lcs_name, PRANGE(output));
+                GOTO(out_req, rc = -EINVAL);
+        }
+        *in = *out;
+
+        CDEBUG(D_INFO, "%s: Allocated %s-sequence "DRANGE"]\n",
+               seq->lcs_name, opcname, PRANGE(output));
+
+        EXIT;
+out_req:
+        ptlrpc_req_finished(req);
+        return rc;
+}
+
+
+/* Request sequence-controller node to allocate new meta-sequence. */
+static int seq_client_alloc_meta(struct lu_client_seq *seq)
+{
+        int rc;
+        ENTRY;
+
+        rc = seq_client_rpc(seq, NULL, &seq->lcs_space,
+                            SEQ_ALLOC_META, "meta");
+        RETURN(rc);
+}
+
+/* Allocate new sequence for client. */
+static int seq_client_alloc_seq(struct lu_client_seq *seq, seqno_t *seqnr)
+{
+        int rc;
+        ENTRY;
+
+        LASSERT(range_is_sane(&seq->lcs_space));
+
+        if (range_is_exhausted(&seq->lcs_space)) {
+                rc = seq_client_alloc_meta(seq);
+                if (rc) {
+                        CERROR("%s: Can't allocate new meta-sequence, "
+                               "rc %d\n", seq->lcs_name, rc);
+                        RETURN(rc);
+                } else {
+                        CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
+                               seq->lcs_name, PRANGE(&seq->lcs_space));
+                }
+        } else {
+                rc = 0;
+        }
+
+        LASSERT(!range_is_exhausted(&seq->lcs_space));
+        *seqnr = seq->lcs_space.lsr_start;
+        seq->lcs_space.lsr_start += 1;
+
+        CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name,
+               *seqnr);
+
+        RETURN(rc);
+}
+
+/* Allocate new fid on passed client @seq and save it to @fid. */
+static int seq_client_alloc_fid(struct lu_client_seq *seq, struct lu_fid *fid)
+{
+        int rc;
+        ENTRY;
+
+        LASSERT(seq != NULL);
+        LASSERT(fid != NULL);
+
+        down(&seq->lcs_sem);
+
+        if (fid_is_zero(&seq->lcs_fid) ||
+            fid_oid(&seq->lcs_fid) >= seq->lcs_width)
+        {
+                seqno_t seqnr;
+
+                rc = seq_client_alloc_seq(seq, &seqnr);
+                if (rc) {
+                        CERROR("%s: Can't allocate new sequence, "
+                               "rc %d\n", seq->lcs_name, rc);
+                        up(&seq->lcs_sem);
+                        RETURN(rc);
+                }
+
+                CDEBUG(D_INFO, "%s: Switch to sequence "
+                       "[0x%16.16"LPF64"x]\n", seq->lcs_name, seqnr);
+
+                seq->lcs_fid.f_seq = seqnr;
+                seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID;
+                seq->lcs_fid.f_ver = 0;
+
+                /*
+                 * Inform caller that sequence switch is performed to allow it
+                 * to setup FLD for it.
+                 */
+                rc = 1;
+        } else {
+                /* Just bump last allocated fid and return to caller. */
+                seq->lcs_fid.f_oid += 1;
+                rc = 0;
+        }
+
+        *fid = seq->lcs_fid;
+        up(&seq->lcs_sem);
+
+        CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name,  PFID(fid));
+        RETURN(rc);
+}
+
+/*
+ * Finish the current sequence due to disconnect.
+ * See mdc_import_event()
+ */
+static void seq_client_flush(struct lu_client_seq *seq)
+{
+        LASSERT(seq != NULL);
+        down(&seq->lcs_sem);
+        fid_init(&seq->lcs_fid);
+        range_init(&seq->lcs_space);
+        up(&seq->lcs_sem);
+}
+
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+        return 0;
+}
+
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+        return;
+}
+
+int seq_client_init(struct lu_client_seq *seq,
+                    struct obd_export *exp,
+                    enum lu_cli_type type,
+                    __u64 width,
+                    const char *prefix)
+{
+        int rc;
+        ENTRY;
+
+        LASSERT(seq != NULL);
+        LASSERT(prefix != NULL);
+
+        seq->lcs_exp = exp;
+        seq->lcs_type = type;
+        sema_init(&seq->lcs_sem, 1);
+        seq->lcs_width = width;
+
+        /* Make sure that things are clear before work is started. */
+        seq_client_flush(seq);
+
+        LASSERT(seq->lcs_exp != NULL);
+        seq->lcs_exp = class_export_get(seq->lcs_exp);
+
+        snprintf(seq->lcs_name, sizeof(seq->lcs_name),
+                 "cli-%s", prefix);
+
+        rc = seq_client_proc_init(seq);
+        if (rc)
+                seq_client_fini(seq);
+        RETURN(rc);
+}
+
+void seq_client_fini(struct lu_client_seq *seq)
+{
+        ENTRY;
+
+        seq_client_proc_fini(seq);
+        LASSERT(seq->lcs_exp != NULL);
+
+        if (seq->lcs_exp != NULL) {
+                class_export_put(seq->lcs_exp);
+                seq->lcs_exp = NULL;
+        }
+
+        EXIT;
+}
+
+/* Allocate new fid on passed client @seq and save it to @fid. */
+int mdc_fid_alloc(struct lu_client_seq *seq, struct lu_fid *fid)
+{
+        int rc;
+        ENTRY;
+        
+        rc = seq_client_alloc_fid(seq, fid);
+        if (rc > 0)
+                rc = 0;
+        RETURN(rc);
+}
+
+void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+        /* check that all fields are converted */
+        CLASSERT(sizeof *src ==
+                 sizeof fid_seq(src) +
+                 sizeof fid_oid(src) + sizeof fid_ver(src));
+        LASSERTF(fid_is_igif(src) || fid_ver(src) == 0, DFID"\n", PFID(src));
+        dst->f_seq = cpu_to_le64(fid_seq(src));
+        dst->f_oid = cpu_to_le32(fid_oid(src));
+        dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+EXPORT_SYMBOL(fid_cpu_to_le);
+
+void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+        /* check that all fields are converted */
+        CLASSERT(sizeof *src ==
+                 sizeof fid_seq(src) +
+                 sizeof fid_oid(src) + sizeof fid_ver(src));
+        dst->f_seq = le64_to_cpu(fid_seq(src));
+        dst->f_oid = le32_to_cpu(fid_oid(src));
+        dst->f_ver = le32_to_cpu(fid_ver(src));
+        LASSERTF(fid_is_igif(dst) || fid_ver(dst) == 0, DFID"\n", PFID(dst));
+}
+EXPORT_SYMBOL(fid_le_to_cpu);
+
+void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+        /* check that all fields are converted */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lsr_start) +
+                 sizeof(src->lsr_end) +
+                 sizeof(src->lsr_mdt) +
+                 sizeof(src->lsr_padding));
+        dst->lsr_start = cpu_to_le64(src->lsr_start);
+        dst->lsr_end = cpu_to_le64(src->lsr_end);
+}
+EXPORT_SYMBOL(range_cpu_to_le);
+
+void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+        /* check that all fields are converted */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lsr_start) +
+                 sizeof(src->lsr_end) +
+                 sizeof(src->lsr_mdt) +
+                 sizeof(src->lsr_padding));
+
+        dst->lsr_start = le64_to_cpu(src->lsr_start);
+        dst->lsr_end = le64_to_cpu(src->lsr_end);
+}
+EXPORT_SYMBOL(range_le_to_cpu);
+
+void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+        /* check that all fields are converted */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lsr_start) +
+                 sizeof(src->lsr_end) +
+                 sizeof(src->lsr_mdt) +
+                 sizeof(src->lsr_padding));
+
+        dst->lsr_start = cpu_to_be64(src->lsr_start);
+        dst->lsr_end = cpu_to_be64(src->lsr_end);
+}
+EXPORT_SYMBOL(range_cpu_to_be);
+
+void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+        /* check that all fields are converted */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lsr_start) +
+                 sizeof(src->lsr_end) +
+                 sizeof(src->lsr_mdt) +
+                 sizeof(src->lsr_padding));
+
+        dst->lsr_start = be64_to_cpu(src->lsr_start);
+        dst->lsr_end = be64_to_cpu(src->lsr_end);
+}
+EXPORT_SYMBOL(range_be_to_cpu);
+
+/**     
+ * Build (DLM) resource name from fid.
+ */
+struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f, struct ldlm_res_id *name)
+{       
+        memset(name, 0, sizeof *name);
+        name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f);
+        name->name[LUSTRE_RES_ID_OID_OFF] = fid_oid(f);
+        if (!fid_is_igif(f))
+                name->name[LUSTRE_RES_ID_VER_OFF] = fid_ver(f);
+        return name;
+}
+EXPORT_SYMBOL(fid_build_reg_res_name);
+
+/**
+ * Return true if resource is for object identified by fid.
+ */
+int fid_res_name_eq(const struct lu_fid *f, const struct ldlm_res_id *name)
+{
+        int ret;
+        
+        ret = name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) &&
+              name->name[LUSTRE_RES_ID_OID_OFF] == fid_oid(f);
+        if (!fid_is_igif(f))
+                ret = ret && name->name[LUSTRE_RES_ID_VER_OFF] == fid_ver(f);
+        return ret;
+}
+EXPORT_SYMBOL(fid_res_name_eq);
diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h

index 843bcbe..9104238 100644 (file)
--- a/lustre/mdc/mdc_internal.h
+++ b/lustre/mdc/mdc_internal.h
@@ -1,7 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mdc/mdc_internal.h
   *
   * MDC internal definitions.
   */
@@ -28,7 +60,7 @@ void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
                        void *ea2, int ea2len);
  void mdc_create_pack(struct ptlrpc_request *req, int offset,
                       struct mdc_op_data *op_data, const void *data, int datalen,
-                     __u32 mode, __u32 uid, __u32 gid, __u32 cap_effective,
+                     __u32 mode, __u32 uid, __u32 gid, cfs_cap_t cap_effective,
                       __u64 rdev);
  void mdc_open_pack(struct ptlrpc_request *req, int offset,
                     struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
@@ -42,11 +74,23 @@ void mdc_link_pack(struct ptlrpc_request *req, int offset,
  void mdc_rename_pack(struct ptlrpc_request *req, int offset,
                       struct mdc_op_data *data,
                       const char *old, int oldlen, const char *new, int newlen);
-void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa,
+void mdc_close_pack(struct ptlrpc_request *req, int offset,
+                    struct mdc_op_data *data,
+                    struct obdo *oa,
                      __u64 valid, struct obd_client_handle *och);
  void mdc_exit_request(struct client_obd *cli);
  void mdc_enter_request(struct client_obd *cli);
  
+int seq_client_init(struct lu_client_seq *seq,
+                    struct obd_export *exp,
+                    enum lu_cli_type type,
+                    __u64 width,
+                    const char *prefix);
+
+void seq_client_fini(struct lu_client_seq *seq);
+
+int mdc_fid_alloc(struct lu_client_seq *seq, struct lu_fid *fid);
+
  struct mdc_open_data {
          struct obd_client_handle *mod_och;
          struct ptlrpc_request    *mod_open_req;
@@ -85,3 +129,13 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
          }
          EXIT;
  }
+
+static inline int mdc_exp_is_2_0_server(struct obd_export *exp) {
+       LASSERT(exp);
+       return !!(exp->exp_connect_flags & OBD_CONNECT_FID);
+}
+
+static inline int mdc_req_is_2_0_server(struct ptlrpc_request *req) {
+       LASSERT(req);
+        return mdc_exp_is_2_0_server(req->rq_export);
+}
diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c

index bb02005..579e6d5 100644 (file)
--- a/lustre/mdc/mdc_lib.c
+++ b/lustre/mdc/mdc_lib.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_MDC
@@ -38,62 +50,131 @@
  #endif
  #endif
  
-void mdc_readdir_pack(struct ptlrpc_request *req, int offset, __u64 pg_off,
-                      __u32 size, struct ll_fid *fid)
+static void mdc_readdir_pack_18(struct ptlrpc_request *req, int offset,
+                                __u64 pg_off, __u32 size, struct ll_fid *fid)
  {
          struct mds_body *b;
+        ENTRY;
+
+        CLASSERT(sizeof(struct ll_fid)   == sizeof(struct lu_fid));
+        CLASSERT(sizeof(struct mds_body) <= sizeof(struct mdt_body));
+        CLASSERT((int)offsetof(struct mds_body, max_cookiesize) == 
+                 (int)offsetof(struct mdt_body, max_cookiesize));
+
  
          b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b));
          b->fsuid = current->fsuid;
          b->fsgid = current->fsgid;
-        b->capability = current->cap_effective;
+        b->capability = cfs_curproc_cap_pack();
          b->fid1 = *fid;
          b->size = pg_off;                       /* !! */
          b->suppgid = -1;
          b->nlink = size;                        /* !! */
+        EXIT;
  }
  
-static void mdc_pack_body(struct mds_body *b)
+static void mdc_readdir_pack_20(struct ptlrpc_request *req, int offset,
+                                __u64 pg_off, __u32 size, struct ll_fid *fid)
  {
-        LASSERT (b != NULL);
+        struct mdt_body *b;
+        ENTRY;
  
+        b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b));
          b->fsuid = current->fsuid;
          b->fsgid = current->fsgid;
-        b->capability = current->cap_effective;
+        b->capability = cfs_curproc_cap_pack();
+
+        if (fid) {
+                b->fid1 = *((struct lu_fid*)fid);
+                b->valid |= OBD_MD_FLID;
+        }
+        b->size = pg_off;                       /* !! */
+        b->suppgid = -1;
+        b->nlink = size;                        /* !! */
+        EXIT;
  }
  
-void mdc_pack_req_body(struct ptlrpc_request *req, int offset,
-                       __u64 valid, struct ll_fid *fid, int ea_size, int flags)
+void mdc_readdir_pack(struct ptlrpc_request *req, int offset,
+                      __u64 pg_off, __u32 size, struct ll_fid *fid)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_readdir_pack_20(req, offset, pg_off, size, fid);
+        else
+                mdc_readdir_pack_18(req, offset, pg_off, size, fid);
+}
+
+static void mdc_pack_req_body_18(struct ptlrpc_request *req, int offset,
+                                 __u64 valid, struct ll_fid *fid, int ea_size,
+                                 int flags)
  {
          struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b));
+        ENTRY;
+        LASSERT (b != NULL);
  
          if (fid)
                  b->fid1 = *fid;
          b->valid = valid;
          b->eadatasize = ea_size;
          b->flags = flags;
-        mdc_pack_body(b);
+        b->fsuid = current->fsuid;
+        b->fsgid = current->fsgid;
+        b->capability = cfs_curproc_cap_pack();
+        EXIT;
+}
+
+static void mdc_pack_req_body_20(struct ptlrpc_request *req, int offset,
+                                 __u64 valid, struct ll_fid *fid, int ea_size,
+                                 int flags)
+{
+        struct mdt_body *b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b));
+        ENTRY;
+        LASSERT (b != NULL);
+
+        b->valid      = valid;
+        b->eadatasize = ea_size;
+        b->flags      = flags;
+        if (fid) {
+                b->fid1 = *((struct lu_fid*)fid);
+                b->valid |= OBD_MD_FLID;
+        }
+
+        b->fsuid = current->fsuid;
+        b->fsgid = current->fsgid;
+        b->capability = cfs_curproc_cap_pack();
+        EXIT;
+}
+
+void mdc_pack_req_body(struct ptlrpc_request *req, int offset,
+                       __u64 valid, struct ll_fid *fid, int ea_size,
+                       int flags)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_pack_req_body_20(req, offset, valid, fid, ea_size, flags);
+        else
+                mdc_pack_req_body_18(req, offset, valid, fid, ea_size, flags);
  }
  
  /* packing of MDS records */
-void mdc_create_pack(struct ptlrpc_request *req, int offset,
-                     struct mdc_op_data *op_data, const void *data, int datalen,
-                     __u32 mode, __u32 uid, __u32 gid, __u32 cap_effective,
-                     __u64 rdev)
+static void mdc_create_pack_18(struct ptlrpc_request *req, int offset,
+                               struct mdc_op_data *op_data, const void *data,
+                               int datalen, __u32 mode, __u32 uid, __u32 gid,
+                               cfs_cap_t cap_effective, __u64 rdev)
  {
          struct mds_rec_create *rec;
          char *tmp;
+        ENTRY;
+
          rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
  
-        rec->cr_opcode = REINT_CREATE;
-        rec->cr_fsuid = uid;
-        rec->cr_fsgid = gid;
-        rec->cr_cap = cap_effective;
-        rec->cr_fid = op_data->fid1;
+        rec->cr_opcode  = REINT_CREATE;
+        rec->cr_fsuid   = uid;
+        rec->cr_fsgid   = gid;
+        rec->cr_cap     = cap_effective;
+        rec->cr_fid     = op_data->fid1;
          memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid));
-        rec->cr_mode = mode;
-        rec->cr_rdev = rdev;
-        rec->cr_time = op_data->mod_time;
+        rec->cr_mode    = mode;
+        rec->cr_rdev    = rdev;
+        rec->cr_time    = op_data->mod_time;
          rec->cr_suppgid = op_data->suppgids[0];
  
          tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, op_data->namelen + 1);
@@ -103,6 +184,53 @@ void mdc_create_pack(struct ptlrpc_request *req, int offset,
                  tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, datalen);
                  memcpy (tmp, data, datalen);
          }
+        EXIT;
+}
+
+static void mdc_create_pack_20(struct ptlrpc_request *req, int offset,
+                               struct mdc_op_data *op_data, const void *data,
+                               int datalen, __u32 mode, __u32 uid, __u32 gid,
+                               cfs_cap_t cap_effective, __u64 rdev)
+{
+        struct mdt_rec_create *rec;
+        char *tmp;
+        ENTRY;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        rec->cr_opcode   = REINT_CREATE;
+        rec->cr_fsuid    = uid;
+        rec->cr_fsgid    = gid;
+        rec->cr_cap      = cap_effective;
+        memcpy(&rec->cr_fid1, &op_data->fid1, sizeof(op_data->fid1));
+        memcpy(&rec->cr_fid2, &op_data->fid2, sizeof(op_data->fid2));
+        rec->cr_mode     = mode;
+        rec->cr_rdev     = rdev;
+        rec->cr_time     = op_data->mod_time;
+        rec->cr_suppgid1 = op_data->suppgids[0];
+
+        /* offset + 1  == capa */
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, op_data->namelen + 1);
+        LOGL0(op_data->name, op_data->namelen, tmp);
+
+        if (data) {
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3, datalen);
+                memcpy(tmp, data, datalen);
+        }
+        EXIT;
+}
+
+void mdc_create_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *op_data, const void *data,
+                     int datalen, __u32 mode, __u32 uid, __u32 gid,
+                     cfs_cap_t cap_effective, __u64 rdev)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_create_pack_20(req, offset, op_data, data, datalen,
+                                   mode, uid, gid, cap_effective, rdev);
+        else
+                mdc_create_pack_18(req, offset, op_data, data, datalen,
+                                   mode, uid, gid, cap_effective, rdev);
  }
  
  static __u32 mds_pack_open_flags(__u32 flags)
@@ -133,36 +261,62 @@ static __u32 mds_pack_open_flags(__u32 flags)
  }
  
  /* packing of MDS records */
-void mdc_join_pack(struct ptlrpc_request *req, int offset,
-                   struct mdc_op_data *op_data, __u64 head_size)
+static void mdc_join_pack_18(struct ptlrpc_request *req, int offset,
+                             struct mdc_op_data *op_data, __u64 head_size)
  {
          struct mds_rec_join *rec;
+        ENTRY;
  
          rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*rec));
          LASSERT(rec != NULL);
          rec->jr_fid = op_data->fid2;
          rec->jr_headsize = head_size;
+        EXIT;
  }
  
-void mdc_open_pack(struct ptlrpc_request *req, int offset,
-                   struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
-                   __u32 flags, const void *lmm, int lmmlen)
+static void mdc_join_pack_20(struct ptlrpc_request *req, int offset,
+                             struct mdc_op_data *op_data, __u64 head_size)
+{
+        struct mdt_rec_join *rec;
+        ENTRY;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*rec));
+        LASSERT(rec != NULL);
+        memcpy(&rec->jr_fid, &op_data->fid2, sizeof(op_data->fid2));
+        rec->jr_headsize = head_size;
+        EXIT;
+}
+
+void mdc_join_pack(struct ptlrpc_request *req, int offset,
+                   struct mdc_op_data *op_data, __u64 head_size)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_join_pack_20(req, offset, op_data, head_size);
+        else
+                mdc_join_pack_18(req, offset, op_data, head_size);
+}
+
+static void mdc_open_pack_18(struct ptlrpc_request *req, int offset,
+                            struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
+                             __u32 flags, const void *lmm, int lmmlen)
  {
          struct mds_rec_create *rec;
          char *tmp;
+        ENTRY;
+
          rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
  
          /* XXX do something about time, uid, gid */
-        rec->cr_opcode = REINT_OPEN;
-        rec->cr_fsuid = current->fsuid;
-        rec->cr_fsgid = current->fsgid;
-        rec->cr_cap = current->cap_effective;
-        rec->cr_fid = op_data->fid1;
+        rec->cr_opcode  = REINT_OPEN;
+        rec->cr_fsuid   = current->fsuid;
+        rec->cr_fsgid   = current->fsgid;
+        rec->cr_cap     = cfs_curproc_cap_pack();
+        rec->cr_fid     = op_data->fid1;
          memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid));
-        rec->cr_mode = mode;
-        rec->cr_flags = mds_pack_open_flags(flags);
-        rec->cr_rdev = rdev;
-        rec->cr_time = op_data->mod_time;
+        rec->cr_mode    = mode;
+        rec->cr_flags   = mds_pack_open_flags(flags);
+        rec->cr_rdev    = rdev;
+        rec->cr_time    = op_data->mod_time;
          rec->cr_suppgid = op_data->suppgids[0];
  
          if (op_data->name) {
@@ -180,6 +334,66 @@ void mdc_open_pack(struct ptlrpc_request *req, int offset,
                  tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, lmmlen);
                  memcpy (tmp, lmm, lmmlen);
          }
+        EXIT;
+}
+
+static void mdc_open_pack_20(struct ptlrpc_request *req, int offset,
+                            struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
+                             __u32 flags, const void *lmm, int lmmlen)
+{
+        struct mdt_rec_create *rec;
+        char *tmp;
+        ENTRY;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        /* XXX do something about time, uid, gid */
+        rec->cr_opcode = REINT_OPEN;
+        rec->cr_fsuid  = current->fsuid;
+        rec->cr_fsgid  = current->fsgid;
+        rec->cr_cap    = cfs_curproc_cap_pack();
+        memcpy(&rec->cr_fid1, &op_data->fid1, sizeof(op_data->fid1));
+        memcpy(&rec->cr_fid2, &op_data->fid2, sizeof(op_data->fid2));
+        rec->cr_mode   = mode;
+        rec->cr_flags  = mds_pack_open_flags(flags);
+        rec->cr_rdev   = rdev;
+        rec->cr_time   = op_data->mod_time;
+        rec->cr_suppgid1 = op_data->suppgids[0];
+        rec->cr_suppgid2 = op_data->suppgids[1];
+
+        if (op_data->name) {
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3,
+                                     op_data->namelen + 1);
+                CDEBUG(D_INFO, "offset=%d, src=%p(%d):%s, dst=%p\n",
+                        offset, op_data->name, op_data->namelen,
+                        op_data->name, tmp);
+                LASSERT(tmp);
+                LOGL0(op_data->name, op_data->namelen, tmp);
+        }
+
+        if (lmm) {
+                rec->cr_flags |= MDS_OPEN_HAS_EA;
+#ifndef __KERNEL__
+                /*XXX a hack for liblustre to set EA (LL_IOC_LOV_SETSTRIPE) */
+                memcpy(&rec->cr_fid2, &op_data->fid2, sizeof(op_data->fid2));
+#endif
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 4, lmmlen);
+                memcpy(tmp, lmm, lmmlen);
+        }
+        EXIT;
+}
+
+void mdc_open_pack(struct ptlrpc_request *req, int offset,
+                   struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
+                   __u32 flags, const void *lmm, int lmmlen)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_open_pack_20(req, offset, op_data, mode, rdev,
+                                 flags, lmm, lmmlen);
+        else
+                mdc_open_pack_18(req, offset, op_data, mode, rdev,
+                                 flags, lmm, lmmlen);
+
  }
  
  static inline __u64 attr_pack(unsigned int ia_valid) {
@@ -221,16 +435,18 @@ static inline __u64 attr_pack(unsigned int ia_valid) {
          return sa_valid;
  }
  
-void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
-                      struct mdc_op_data *data, struct iattr *iattr, void *ea,
-                      int ealen, void *ea2, int ea2len)
+void mdc_setattr_pack_18(struct ptlrpc_request *req, int offset,
+                         struct mdc_op_data *data, struct iattr *iattr, void *ea,
+                         int ealen, void *ea2, int ea2len)
  {
          struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset,
                                                       sizeof(*rec));
+        ENTRY;
+
          rec->sa_opcode = REINT_SETATTR;
          rec->sa_fsuid = current->fsuid;
          rec->sa_fsgid = current->fsgid;
-        rec->sa_cap = current->cap_effective;
+        rec->sa_cap = cfs_curproc_cap_pack();
          rec->sa_fid = data->fid1;
          rec->sa_suppgid = -1;
  
@@ -251,22 +467,86 @@ void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
                          rec->sa_suppgid = data->suppgids[0];
          }
  
-        if (ealen == 0)
+        if (ealen == 0) {
+                EXIT;
                  return;
+        }
  
          memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 1, ealen), ea, ealen);
  
-        if (ea2len == 0)
+        if (ea2len == 0) {
+                EXIT;
                  return;
-
+        }
          memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 2, ea2len), ea2, ea2len);
+
+        EXIT;
  }
  
-void mdc_unlink_pack(struct ptlrpc_request *req, int offset,
-                     struct mdc_op_data *data)
+static void mdc_setattr_pack_20(struct ptlrpc_request *req, int offset,
+                                struct mdc_op_data *data, struct iattr *iattr,
+                                void *ea, int ealen, void *ea2, int ea2len)
+{
+        struct mdt_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset,
+                                                     sizeof(*rec));
+        ENTRY;
+
+        rec->sa_opcode  = REINT_SETATTR;
+        rec->sa_fsuid   = current->fsuid;
+        rec->sa_fsgid   = current->fsgid;
+        rec->sa_cap     = cfs_curproc_cap_pack();
+        memcpy(&rec->sa_fid, &data->fid1, sizeof(data->fid1));
+        rec->sa_suppgid = -1;
+
+        if (iattr) {
+                rec->sa_valid   = attr_pack(iattr->ia_valid);
+                rec->sa_mode    = iattr->ia_mode;
+                rec->sa_uid     = iattr->ia_uid;
+                rec->sa_gid     = iattr->ia_gid;
+                rec->sa_size    = iattr->ia_size;
+//              rec->sa_blocks  = iattr->ia_blocks;
+                rec->sa_atime   = LTIME_S(iattr->ia_atime);
+                rec->sa_mtime   = LTIME_S(iattr->ia_mtime);
+                rec->sa_ctime   = LTIME_S(iattr->ia_ctime);
+                rec->sa_attr_flags = 
+                        ((struct ll_iattr_struct *)iattr)->ia_attr_flags;
+                if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid))
+                        rec->sa_suppgid = iattr->ia_gid;
+                else
+                        rec->sa_suppgid = data->suppgids[0];
+        }
+        if (ealen == 0) {
+                EXIT;
+                return;
+        }
+        memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 3, ealen), ea, ealen);
+
+        if (ea2len == 0) {
+                EXIT;
+                return;
+        }
+        memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 4, ea2len), ea2, ea2len);
+        EXIT;
+}
+
+void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
+                      struct mdc_op_data *data, struct iattr *iattr,
+                      void *ea, int ealen, void *ea2, int ea2len)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_setattr_pack_20(req, offset, data, iattr,
+                                    ea, ealen, ea2, ea2len);
+        else
+                mdc_setattr_pack_18(req, offset, data, iattr,
+                                    ea, ealen, ea2, ea2len);
+}
+
+static void mdc_unlink_pack_18(struct ptlrpc_request *req, int offset,
+                               struct mdc_op_data *data)
  {
          struct mds_rec_unlink *rec;
          char *tmp;
+        ENTRY;
  
          rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
          LASSERT (rec != NULL);
@@ -274,7 +554,7 @@ void mdc_unlink_pack(struct ptlrpc_request *req, int offset,
          rec->ul_opcode = REINT_UNLINK;
          rec->ul_fsuid = current->fsuid;
          rec->ul_fsgid = current->fsgid;
-        rec->ul_cap = current->cap_effective;
+        rec->ul_cap = cfs_curproc_cap_pack();
          rec->ul_mode = data->create_mode;
          rec->ul_suppgid = data->suppgids[0];
          rec->ul_fid1 = data->fid1;
@@ -284,20 +564,58 @@ void mdc_unlink_pack(struct ptlrpc_request *req, int offset,
          tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1);
          LASSERT (tmp != NULL);
          LOGL0(data->name, data->namelen, tmp);
+        EXIT;
  }
  
-void mdc_link_pack(struct ptlrpc_request *req, int offset,
-                   struct mdc_op_data *data)
+static void mdc_unlink_pack_20(struct ptlrpc_request *req, int offset,
+                               struct mdc_op_data *data)
+{
+        struct mdt_rec_unlink *rec;
+        char *tmp;
+        ENTRY;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+        LASSERT (rec != NULL);
+
+        rec->ul_opcode  = REINT_UNLINK;
+        rec->ul_fsuid   = current->fsuid;
+        rec->ul_fsgid   = current->fsgid;
+        rec->ul_cap     = cfs_curproc_cap_pack();
+        rec->ul_mode    = data->create_mode;
+        rec->ul_suppgid1= data->suppgids[0];
+        memcpy(&rec->ul_fid1, &data->fid1, sizeof(data->fid1));
+        memcpy(&rec->ul_fid2, &data->fid2, sizeof(data->fid2));
+        rec->ul_time    = data->mod_time;
+
+        /* NULL capa is skipped. */
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, data->namelen + 1);
+        LASSERT (tmp != NULL);
+        LOGL0(data->name, data->namelen, tmp);
+        EXIT;
+}
+
+void mdc_unlink_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *data)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_unlink_pack_20(req, offset, data);
+        else
+                mdc_unlink_pack_18(req, offset, data);
+}
+static void mdc_link_pack_18(struct ptlrpc_request *req, int offset,
+                             struct mdc_op_data *data)
  {
          struct mds_rec_link *rec;
          char *tmp;
+        ENTRY;
  
          rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
  
          rec->lk_opcode = REINT_LINK;
          rec->lk_fsuid = current->fsuid;
          rec->lk_fsgid = current->fsgid;
-        rec->lk_cap = current->cap_effective;
+        rec->lk_cap = cfs_curproc_cap_pack();
          rec->lk_suppgid1 = data->suppgids[0];
          rec->lk_suppgid2 = data->suppgids[1];
          rec->lk_fid1 = data->fid1;
@@ -306,14 +624,53 @@ void mdc_link_pack(struct ptlrpc_request *req, int offset,
  
          tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1);
          LOGL0(data->name, data->namelen, tmp);
+        EXIT;
  }
  
-void mdc_rename_pack(struct ptlrpc_request *req, int offset,
-                     struct mdc_op_data *data,
-                     const char *old, int oldlen, const char *new, int newlen)
+static void mdc_link_pack_20(struct ptlrpc_request *req, int offset,
+                             struct mdc_op_data *data)
+{
+        struct mdt_rec_link *rec;
+        char *tmp;
+        ENTRY;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        rec->lk_opcode   = REINT_LINK;
+        rec->lk_fsuid    = current->fsuid;
+        rec->lk_fsgid    = current->fsgid;
+        rec->lk_cap      = cfs_curproc_cap_pack();
+        rec->lk_suppgid1 = data->suppgids[0];
+        rec->lk_suppgid2 = data->suppgids[1];
+        memcpy(&rec->lk_fid1, &data->fid1, sizeof(data->fid1));
+        memcpy(&rec->lk_fid2, &data->fid2, sizeof(data->fid2));
+        rec->lk_time     = data->mod_time;
+
+
+        /* capa @ offset + 1; */
+        /* capa @ offset + 2; */
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3, data->namelen + 1);
+        LOGL0(data->name, data->namelen, tmp);
+        EXIT;
+}
+
+void mdc_link_pack(struct ptlrpc_request *req, int offset,
+                   struct mdc_op_data *data)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_link_pack_20(req, offset, data);
+        else
+                mdc_link_pack_18(req, offset, data);
+}
+
+static void mdc_rename_pack_18(struct ptlrpc_request *req, int offset,
+                               struct mdc_op_data *data, const char *old, 
+                               int oldlen, const char *new, int newlen)
  {
          struct mds_rec_rename *rec;
          char *tmp;
+        ENTRY;
  
          rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
  
@@ -321,7 +678,7 @@ void mdc_rename_pack(struct ptlrpc_request *req, int offset,
          rec->rn_opcode = REINT_RENAME;
          rec->rn_fsuid = current->fsuid;
          rec->rn_fsgid = current->fsgid;
-        rec->rn_cap = current->cap_effective;
+        rec->rn_cap = cfs_curproc_cap_pack();
          rec->rn_suppgid1 = data->suppgids[0];
          rec->rn_suppgid2 = data->suppgids[1];
          rec->rn_fid1 = data->fid1;
@@ -335,17 +692,66 @@ void mdc_rename_pack(struct ptlrpc_request *req, int offset,
                  tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, newlen + 1);
                  LOGL0(new, newlen, tmp);
          }
+        EXIT;
+}
+
+static void mdc_rename_pack_20(struct ptlrpc_request *req, int offset,
+                               struct mdc_op_data *data, const char *old,
+                               int oldlen, const char *new, int newlen)
+{
+        struct mdt_rec_rename *rec;
+        char *tmp;
+        ENTRY;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        /* XXX do something about time, uid, gid */
+        rec->rn_opcode   = REINT_RENAME;
+        rec->rn_fsuid    = current->fsuid;
+        rec->rn_fsgid    = current->fsgid;
+        rec->rn_cap      = cfs_curproc_cap_pack();
+        rec->rn_suppgid1 = data->suppgids[0];
+        rec->rn_suppgid2 = data->suppgids[1];
+        memcpy(&rec->rn_fid1, &data->fid1, sizeof(data->fid1));
+        memcpy(&rec->rn_fid2, &data->fid2, sizeof(data->fid2));
+        rec->rn_time     = data->mod_time;
+        rec->rn_mode     = data->create_mode;
+
+
+        /* skip capa @ offset + 1 */
+        /* skip capa @ offset + 2 */
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3, oldlen + 1);
+        LOGL0(old, oldlen, tmp);
+
+        if (new) {
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 4, newlen + 1);
+                LOGL0(new, newlen, tmp);
+        }
+        EXIT;
+}
+
+void mdc_rename_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *data, const char *old,
+                     int oldlen, const char *new, int newlen)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_rename_pack_20(req, offset, data, old, oldlen, new, newlen);
+        else
+                mdc_rename_pack_18(req, offset, data, old, oldlen, new, newlen);
  }
  
-void mdc_getattr_pack(struct ptlrpc_request *req, int offset, __u64 valid,
-                      int flags, struct mdc_op_data *data)
+static void mdc_getattr_pack_18(struct ptlrpc_request *req, int offset,
+                                __u64 valid, int flags, struct mdc_op_data *data)
  {
          struct mds_body *b;
+        ENTRY;
+
          b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b));
  
          b->fsuid = current->fsuid;
          b->fsgid = current->fsgid;
-        b->capability = current->cap_effective;
+        b->capability = cfs_curproc_cap_pack();
          b->valid = valid;
          b->flags = flags | MDS_BFLAG_EXT_FLAGS;
          /* skip MDS_BFLAG_EXT_FLAGS to verify the "client < 1.4.7" case 
@@ -364,16 +770,56 @@ void mdc_getattr_pack(struct ptlrpc_request *req, int offset, __u64 valid,
                  memcpy(tmp, data->name, data->namelen);
                  data->name = tmp;
          }
+        EXIT;
  }
  
-void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa,
-                    __u64 valid, struct obd_client_handle *och)
+static void mdc_getattr_pack_20(struct ptlrpc_request *req, int offset,
+                                __u64 valid, int flags, struct mdc_op_data *data)
+{
+        struct mdt_body *b;
+        ENTRY;
+
+        b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*b));
+
+        b->fsuid = current->fsuid;
+        b->fsgid = current->fsgid;
+        b->capability = cfs_curproc_cap_pack();
+        b->valid = valid;
+        b->flags = flags | MDS_BFLAG_EXT_FLAGS;
+        b->suppgid = data->suppgids[0];
+
+        memcpy(&b->fid1, &data->fid1, sizeof(data->fid1));
+        memcpy(&b->fid2, &data->fid2, sizeof(data->fid2));
+        b->valid |= OBD_MD_FLID;
+        if (data->name) {
+                char *tmp;
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2,
+                                     data->namelen + 1);
+                LASSERT(tmp);
+                LOGL0(data->name, data->namelen, tmp);
+        }
+        EXIT;
+}
+
+void mdc_getattr_pack(struct ptlrpc_request *req, int offset,
+                      __u64 valid, int flags, struct mdc_op_data *data)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_getattr_pack_20(req, offset, valid, flags, data);
+        else
+                mdc_getattr_pack_18(req, offset, valid, flags, data);
+}
+static void mdc_close_pack_18(struct ptlrpc_request *req, int offset,
+                              struct mdc_op_data *data,
+                              struct obdo *oa, __u64 valid,
+                              struct obd_client_handle *och)
  {
          struct mds_body *body;
+        ENTRY;
  
          body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*body));
  
-        mdc_pack_fid(&body->fid1, oa->o_id, 0, oa->o_mode);
+        body->fid1 = data->fid1;
          memcpy(&body->handle, &och->och_fh, sizeof(body->handle));
          if (oa->o_valid & OBD_MD_FLATIME) {
                  body->atime = oa->o_atime;
@@ -399,9 +845,73 @@ void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa,
                  body->flags = oa->o_flags;
                  body->valid |= OBD_MD_FLFLAGS;
          }
+        EXIT;
  }
  
-struct mdc_cache_waiter {       
+static void mdc_close_pack_20(struct ptlrpc_request *req, int offset,
+                              struct mdc_op_data *data,
+                              struct obdo *oa, __u64 valid,
+                              struct obd_client_handle *och)
+{
+        struct mdt_epoch *epoch;
+        struct mdt_rec_setattr *rec;
+        ENTRY;
+
+        epoch = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*epoch));
+        rec = lustre_msg_buf(req->rq_reqmsg, offset + 1, sizeof(*rec));
+
+        rec->sa_opcode  = REINT_SETATTR;
+        rec->sa_fsuid   = current->fsuid;
+        rec->sa_fsgid   = current->fsgid;
+        rec->sa_cap     = cfs_curproc_cap_pack();
+        rec->sa_suppgid = -1;
+
+        memcpy(&rec->sa_fid, &data->fid1, sizeof(data->fid1));
+
+        if (oa->o_valid & OBD_MD_FLATIME) {
+                rec->sa_atime = oa->o_atime;
+                rec->sa_valid |= MDS_ATTR_ATIME;
+        }
+        if (oa->o_valid & OBD_MD_FLMTIME) {
+                rec->sa_mtime = oa->o_mtime;
+                rec->sa_valid |= MDS_ATTR_MTIME;
+        }
+        if (oa->o_valid & OBD_MD_FLCTIME) {
+                rec->sa_ctime = oa->o_ctime;
+                rec->sa_valid |= MDS_ATTR_CTIME;
+        }
+        if (oa->o_valid & OBD_MD_FLSIZE) {
+                rec->sa_size = oa->o_size;
+                rec->sa_valid |= MDS_ATTR_SIZE;
+        }
+        if (oa->o_valid & OBD_MD_FLBLOCKS) {
+                rec->sa_blocks = oa->o_blocks;
+                rec->sa_valid |= MDS_ATTR_BLOCKS;
+        }
+        if (oa->o_valid & OBD_MD_FLFLAGS) {
+                rec->sa_attr_flags = oa->o_flags;
+                rec->sa_valid |= MDS_ATTR_ATTR_FLAG;
+        }
+
+        epoch->handle = och->och_fh;
+        epoch->ioepoch = 0;
+        epoch->flags = 0;
+
+        EXIT;
+}
+
+
+void mdc_close_pack(struct ptlrpc_request *req, int offset,
+                    struct mdc_op_data *data,
+                    struct obdo *oa, __u64 valid,
+                    struct obd_client_handle *och)
+{
+        if (mdc_req_is_2_0_server(req))
+                mdc_close_pack_20(req, offset, data, oa, valid, och);
+        else
+                mdc_close_pack_18(req, offset, data, oa, valid, och);
+}
+struct mdc_cache_waiter {
          struct list_head        mcw_entry;
          wait_queue_head_t       mcw_waitq;
  };
diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c

index d06c739..cf868af 100644 (file)
--- a/lustre/mdc/mdc_locks.c
+++ b/lustre/mdc/mdc_locks.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -134,12 +146,10 @@ EXPORT_SYMBOL(mdc_set_lock_data);
  int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid, 
                        ldlm_iterator_t it, void *data)
  {
-        struct ldlm_res_id res_id = { .name = {0} };
+        struct ldlm_res_id res_id;
          ENTRY;
  
-        res_id.name[0] = fid->id;
-        res_id.name[1] = fid->generation;
-
+        fid_build_reg_res_name((struct lu_fid*)fid, &res_id);
          ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id,
                                it, data);
  
@@ -197,7 +207,7 @@ static void mdc_realloc_openmsg(struct ptlrpc_request *req,
  
          OBD_ALLOC(new_msg, new_size);
          if (new_msg != NULL) {
-                DEBUG_REQ(D_INFO, req, "replace reqmsg for larger EA %u\n",
+                DEBUG_REQ(D_INFO, req, "replace reqmsg for larger EA %u",
                            body->eadatasize);
                  memcpy(new_msg, old_msg, old_size);
  
@@ -217,12 +227,12 @@ static void mdc_realloc_openmsg(struct ptlrpc_request *req,
  static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
                                                     struct lookup_intent *it,
                                                     struct mdc_op_data *data,
-                                                   void *lmm, int lmmsize)
+                                                   void *lmm, __u32 lmmsize)
  {
          struct ptlrpc_request *req;
          struct ldlm_intent *lit;
          struct obd_device *obddev = class_exp2obd(exp);
-        int size[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[9] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(struct ldlm_request),
                          [DLM_INTENT_IT_OFF]   = sizeof(*lit),
                          [DLM_INTENT_REC_OFF]  = sizeof(struct mds_rec_create),
@@ -236,27 +246,40 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
                           * default-sized LOV EA for open replay. */
                          [DLM_INTENT_REC_OFF+2]= max(lmmsize,
                                           obddev->u.cli.cl_default_mds_easize) };
-        int repsize[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 repsize[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                             [DLM_LOCKREPLY_OFF]   = sizeof(struct ldlm_reply),
-                           [DLM_REPLY_REC_OFF]   = sizeof(struct mds_body),
+                           [DLM_REPLY_REC_OFF]   = sizeof(struct mdt_body),
                             [DLM_REPLY_REC_OFF+1] = obddev->u.cli.
                                                          cl_max_mds_easize,
                             [DLM_REPLY_REC_OFF+2] = LUSTRE_POSIX_ACL_MAX_SIZE };
          CFS_LIST_HEAD(cancels);
          int do_join = (it->it_flags & O_JOIN_FILE) && data->data;
          int count = 0;
+        int bufcount = 6;
+        int repbufcount = 5;
          int mode;
          int rc;
+        ENTRY;
  
-        it->it_create_mode |= S_IFREG;
-
-        rc = lustre_msg_size(class_exp2cliimp(exp)->imp_msg_magic, 6, size);
+        it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG;
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[DLM_INTENT_REC_OFF] = sizeof(struct mdt_rec_create);
+                size[DLM_INTENT_REC_OFF+4] = size[DLM_INTENT_REC_OFF+2];
+                size[DLM_INTENT_REC_OFF+3] = size[DLM_INTENT_REC_OFF+1];
+                size[DLM_INTENT_REC_OFF+2] = 0; /* capa */
+                size[DLM_INTENT_REC_OFF+1] = 0; /* capa */
+                bufcount = 8;
+                repsize[DLM_REPLY_REC_OFF+3]=sizeof(struct lustre_capa);
+                repsize[DLM_REPLY_REC_OFF+4]=sizeof(struct lustre_capa); 
+                repbufcount = 7;
+        }
+        rc = lustre_msg_size(class_exp2cliimp(exp)->imp_msg_magic,
+                             bufcount, size);
          if (rc & (rc - 1))
-                size[DLM_INTENT_REC_OFF + 2] =
-                         min(size[DLM_INTENT_REC_OFF + 2] + round_up(rc) - rc,
-                                     obddev->u.cli.cl_max_mds_easize);
+                size[bufcount - 1] = min(size[bufcount - 1] + round_up(rc) - rc,
+                                         obddev->u.cli.cl_max_mds_easize);
  
-                /* If inode is known, cancel conflicting OPEN locks. */
+        /* If inode is known, cancel conflicting OPEN locks. */
          if (data->fid2.id) {
                  if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
                          mode = LCK_CW;
@@ -279,14 +302,19 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
                                           MDS_INODELOCK_UPDATE);
          if (do_join) {
                  __u64 head_size = (*(__u64 *)data->data);
-                        /* join is like an unlink of the tail */
-                size[DLM_INTENT_REC_OFF + 3] = sizeof(struct mds_rec_join);
-                req = ldlm_prep_enqueue_req(exp, 7, size, &cancels, count);
+                /* join is like an unlink of the tail */
+                if (mdc_exp_is_2_0_server(exp)) {
+                        size[DLM_INTENT_REC_OFF+5]=sizeof(struct mdt_rec_join);
+                } else {
+                        size[DLM_INTENT_REC_OFF+3]=sizeof(struct mds_rec_join);
+                }
+                bufcount++;
+
+                req = ldlm_prep_enqueue_req(exp, bufcount, size,&cancels,count);
                  if (req)
-                        mdc_join_pack(req, DLM_INTENT_REC_OFF + 3, data, 
-                                      head_size);
+                        mdc_join_pack(req, bufcount - 1, data, head_size);
          } else {
-                req = ldlm_prep_enqueue_req(exp, 6, size, &cancels, count);
+                req = ldlm_prep_enqueue_req(exp, bufcount, size,&cancels,count);
                  it->it_flags &= ~O_JOIN_FILE;
          }
  
@@ -305,9 +333,9 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
                                it->it_create_mode, 0, it->it_flags,
                                lmm, lmmsize);
  
-                ptlrpc_req_set_repsize(req, 5, repsize);
+                ptlrpc_req_set_repsize(req, repbufcount, repsize);
          }
-        return req;
+        RETURN(req);
  }
  
  static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
@@ -317,18 +345,21 @@ static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
          struct ptlrpc_request *req;
          struct ldlm_intent *lit;
          struct obd_device *obddev = class_exp2obd(exp);
-        int size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(struct ldlm_request),
                          [DLM_INTENT_IT_OFF]   = sizeof(*lit),
-                        [DLM_INTENT_REC_OFF]  = sizeof(struct mds_rec_unlink),
+                        [DLM_INTENT_REC_OFF]  = mdc_exp_is_2_0_server(exp) ?
+                                                sizeof(struct mdt_rec_unlink) :
+                                                sizeof(struct mds_rec_unlink),
                          [DLM_INTENT_REC_OFF+1]= data->namelen + 1 };
-        int repsize[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 repsize[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                             [DLM_LOCKREPLY_OFF]   = sizeof(struct ldlm_reply),
-                           [DLM_REPLY_REC_OFF]   = sizeof(struct mds_body),
+                           [DLM_REPLY_REC_OFF]   = sizeof(struct mdt_body),
                             [DLM_REPLY_REC_OFF+1] = obddev->u.cli.
                                                          cl_max_mds_easize,
                             [DLM_REPLY_REC_OFF+2] = obddev->u.cli.
                                                          cl_max_mds_cookiesize };
+        ENTRY;
  
          req = ldlm_prep_enqueue_req(exp, 5, size, NULL, 0);
          if (req) {
@@ -342,7 +373,7 @@ static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
  
                  ptlrpc_req_set_repsize(req, 5, repsize);
          }
-        return req;
+        RETURN(req);
  }
  
  static struct ptlrpc_request *mdc_intent_lookup_pack(struct obd_export *exp,
@@ -352,21 +383,30 @@ static struct ptlrpc_request *mdc_intent_lookup_pack(struct obd_export *exp,
          struct ptlrpc_request *req;
          struct ldlm_intent *lit;
          struct obd_device *obddev = class_exp2obd(exp);
-        int size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(struct ldlm_request),
                          [DLM_INTENT_IT_OFF]   = sizeof(*lit),
-                        [DLM_INTENT_REC_OFF]  = sizeof(struct mds_body),
-                        [DLM_INTENT_REC_OFF+1]= data->namelen + 1 };
-        int repsize[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                        [DLM_INTENT_REC_OFF]  = sizeof(struct mdt_body),
+                        [DLM_INTENT_REC_OFF+1]= data->namelen + 1,
+                        [DLM_INTENT_REC_OFF+2]= 0 };
+        __u32 repsize[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                             [DLM_LOCKREPLY_OFF]   = sizeof(struct ldlm_reply),
-                           [DLM_REPLY_REC_OFF]   = sizeof(struct mds_body),
+                           [DLM_REPLY_REC_OFF]   = sizeof(struct mdt_body),
                             [DLM_REPLY_REC_OFF+1] = obddev->u.cli.
                                                          cl_max_mds_easize,
-                           [DLM_REPLY_REC_OFF+2] = LUSTRE_POSIX_ACL_MAX_SIZE };
+                           [DLM_REPLY_REC_OFF+2] = LUSTRE_POSIX_ACL_MAX_SIZE,
+                           [DLM_REPLY_REC_OFF+3] = 0 };
          obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | OBD_MD_FLACL |
                            OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA;
+        int bufcount = 5;
+        ENTRY;
  
-                req = ldlm_prep_enqueue_req(exp, 5, size, NULL, 0);
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[DLM_INTENT_REC_OFF+1] = 0; /* capa */
+                size[DLM_INTENT_REC_OFF+2] = data->namelen + 1;
+                bufcount = 6;
+        }
+        req = ldlm_prep_enqueue_req(exp, bufcount, size, NULL, 0);
          if (req) {
                  /* pack the intent */
                  lit = lustre_msg_buf(req->rq_reqmsg, DLM_INTENT_IT_OFF,
@@ -376,23 +416,25 @@ static struct ptlrpc_request *mdc_intent_lookup_pack(struct obd_export *exp,
                  /* pack the intended request */
                  mdc_getattr_pack(req, DLM_INTENT_REC_OFF, valid, it->it_flags,
                                   data);
-                ptlrpc_req_set_repsize(req, 5, repsize);
+                ptlrpc_req_set_repsize(req, bufcount, repsize);
          }
-        return req;
+        RETURN(req);
  }
  
  static struct ptlrpc_request *mdc_intent_readdir_pack(struct obd_export *exp)
  {
          struct ptlrpc_request *req;
-        int size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(struct ldlm_request) };
-        int repsize[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
-                           [DLM_LOCKREPLY_OFF]   = sizeof(struct ldlm_reply) };
+        __u32 repsize[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                           [DLM_LOCKREPLY_OFF]   = sizeof(struct ldlm_reply),
+                           [DLM_REPLY_REC_OFF] = sizeof(struct ost_lvb) };
+        ENTRY;
  
          req = ldlm_prep_enqueue_req(exp, 2, size, NULL, 0);
          if (req)
-                ptlrpc_req_set_repsize(req, 2, repsize);
-        return req;
+                ptlrpc_req_set_repsize(req, 3, repsize);
+        RETURN(req);
  }
  
  static int mdc_finish_enqueue(struct obd_export *exp,
@@ -435,7 +477,7 @@ static int mdc_finish_enqueue(struct obd_export *exp,
  
          lockrep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF,
                                   sizeof(*lockrep));
-        LASSERT(lockrep != NULL);                 /* checked by ldlm_cli_enqueue() */
+        LASSERT(lockrep != NULL);  /* checked by ldlm_cli_enqueue() */
          /* swabbed by ldlm_cli_enqueue() */
          LASSERT(lustre_rep_swabbed(req, DLM_LOCKREPLY_OFF));
  
@@ -493,24 +535,6 @@ static int mdc_finish_enqueue(struct obd_export *exp,
                                  CERROR ("Missing/short eadata\n");
                                  RETURN (-EPROTO);
                          }
-                        if (body->valid & OBD_MD_FLMODEASIZE) {
-                                struct obd_device *obddev = class_exp2obd(exp);
-
-                                if (obddev->u.cli.cl_max_mds_easize < 
-                                                        body->max_mdsize) {
-                                        obddev->u.cli.cl_max_mds_easize = 
-                                                body->max_mdsize;
-                                        CDEBUG(D_INFO, "maxeasize become %d\n",
-                                               body->max_mdsize);
-                                }
-                                if (obddev->u.cli.cl_max_mds_cookiesize <
-                                                        body->max_cookiesize) {
-                                        obddev->u.cli.cl_max_mds_cookiesize =
-                                                body->max_cookiesize;
-                                        CDEBUG(D_INFO, "cookiesize become %d\n",
-                                               body->max_cookiesize);
-                                }
-                        }
                          /* We save the reply LOV EA in case we have to replay
                           * a create for recovery.  If we didn't allocate a
                           * large enough request buffer above we need to
@@ -543,18 +567,27 @@ int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
  {
          struct ptlrpc_request *req;
          struct obd_device *obddev = class_exp2obd(exp);
-        struct ldlm_res_id res_id =
-                { .name = {data->fid1.id, data->fid1.generation} };
+        struct ldlm_res_id res_id;
          ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
          int flags = extra_lock_flags | LDLM_FL_HAS_INTENT;
          int rc;
          ENTRY;
  
+        fid_build_reg_res_name((void *)&data->fid1, &res_id);
          LASSERTF(einfo->ei_type == LDLM_IBITS,"lock type %d\n", einfo->ei_type);
          if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR))
                  policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
  
          if (it->it_op & IT_OPEN) {
+                if ((it->it_op & IT_CREAT) && mdc_exp_is_2_0_server(exp)) {
+                        struct client_obd *cli = &obddev->u.cli;
+                        data->fid3 = data->fid2;
+                        rc = mdc_fid_alloc(cli->cl_seq, (void *)&data->fid2);
+                        if (rc) {
+                                CERROR("fid allocation result: %d\n", rc);
+                                RETURN(rc);
+                        }
+                }
                  req = mdc_intent_open_pack(exp, it, data, lmm, lmmsize);
                  if (it->it_flags & O_JOIN_FILE) {
                          policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
@@ -600,11 +633,13 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
                  /* We could just return 1 immediately, but since we should only
                   * be called in revalidate_it if we already have a lock, let's
                   * verify that. */
-        struct ldlm_res_id res_id = {.name ={fid->id, fid->generation}};
+        struct ldlm_res_id res_id;
          struct lustre_handle lockh;
          ldlm_policy_data_t policy;
          ldlm_mode_t mode;
+        ENTRY;
  
+        fid_build_reg_res_name((struct lu_fid*)fid, &res_id);
          /* As not all attributes are kept under update lock, e.g. 
             owner/group/acls are under lookup lock, we need both 
             ibits for GETATTR. */
@@ -620,7 +655,7 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
                  it->d.lustre.it_lock_mode = mode;
          }
  
-        return !!mode;
+        RETURN(!!mode);
  }
  EXPORT_SYMBOL(mdc_revalidate_lock);
  
@@ -659,10 +694,15 @@ static int mdc_finish_intent_lock(struct obd_export *exp,
  
          /* If we were revalidating a fid/name pair, mark the intent in
           * case we fail and get called again from lookup */
-        if (data->fid2.id && (it->it_op != IT_GETATTR)) {
+
+        if (data->fid2.id && (it->it_op != IT_GETATTR) &&
+           ( !mdc_exp_is_2_0_server(exp) ||
+             (mdc_exp_is_2_0_server(exp) && (it->it_flags & O_CHECK_STALE)))) {
                  it_set_disposition(it, DISP_ENQ_COMPLETE);
+
                  /* Also: did we find the same inode? */
-                if (memcmp(&data->fid2, &mds_body->fid1, sizeof(data->fid2))) 
+                if (memcmp(&data->fid2, &mds_body->fid1, sizeof(data->fid2)) &&
+                    memcmp(&data->fid3, &mds_body->fid1, sizeof(data->fid3)))
                          RETURN(-ESTALE);
          }
  
@@ -762,8 +802,11 @@ int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *op_data,
  
          LASSERT(it);
  
-        CDEBUG(D_DLMTRACE,"name: %.*s in inode "LPU64", intent: %s flags %#o\n",
-               op_data->namelen, op_data->name, op_data->fid1.id,
+        CDEBUG(D_DLMTRACE,"name: %.*s("DFID") in inode ("DFID"), "
+               "intent: %s flags %#o\n",
+               op_data->namelen, op_data->name,
+               PFID(((void *)&op_data->fid2)),
+               PFID(((void *)&op_data->fid1)),
                 ldlm_it2str(it->it_op), it->it_flags);
  
          lockh.cookie = 0;
@@ -812,21 +855,15 @@ EXPORT_SYMBOL(mdc_intent_lock);
  static int mdc_intent_getattr_async_interpret(struct ptlrpc_request *req,
                                                void *unused, int rc)
  {
-        struct mdc_enqueue_args  *ma;
-        struct md_enqueue_info   *minfo;
-        struct ldlm_enqueue_info *einfo;
-        struct obd_export        *exp;
+        struct obd_export        *exp = req->rq_async_args.pointer_arg[0];
+        struct md_enqueue_info   *minfo = req->rq_async_args.pointer_arg[1];
+        struct ldlm_enqueue_info *einfo = req->rq_async_args.pointer_arg[2];
          struct lookup_intent     *it;
          struct lustre_handle     *lockh;
          struct obd_device        *obddev;
          int                       flags = LDLM_FL_HAS_INTENT;
          ENTRY;
  
-        ma = (struct mdc_enqueue_args *)&req->rq_async_args;
-        minfo = ma->ma_mi;
-        einfo = ma->ma_ei;
-
-        exp   = minfo->mi_exp;
          it    = &minfo->mi_it;
          lockh = &minfo->mi_lockh;
  
@@ -867,14 +904,10 @@ int mdc_intent_getattr_async(struct obd_export *exp,
          struct lookup_intent    *it = &minfo->mi_it;
          struct ptlrpc_request   *req;
          struct obd_device       *obddev = class_exp2obd(exp);
-        struct ldlm_res_id       res_id = {
-                                        .name = {op_data->fid1.id,
-                                                 op_data->fid1.generation}
-                                 };
+        struct ldlm_res_id res_id;
          ldlm_policy_data_t       policy = {
                                          .l_inodebits = { MDS_INODELOCK_LOOKUP }
                                   };
-        struct mdc_enqueue_args *aa;
          int                      rc;
          int                      flags = LDLM_FL_HAS_INTENT;
          ENTRY;
@@ -883,6 +916,7 @@ int mdc_intent_getattr_async(struct obd_export *exp,
                 op_data->namelen, op_data->name, op_data->fid1.id,
                 ldlm_it2str(it->it_op), it->it_flags);
  
+        fid_build_reg_res_name((void *)&op_data->fid1, &res_id);
          req = mdc_intent_lookup_pack(exp, it, op_data);
          if (!req)
                  RETURN(-ENOMEM);
@@ -895,10 +929,9 @@ int mdc_intent_getattr_async(struct obd_export *exp,
                  RETURN(rc);
          }
  
-        CLASSERT(sizeof(*aa) < sizeof(req->rq_async_args));
-        aa = (struct mdc_enqueue_args *)&req->rq_async_args;
-        aa->ma_mi = minfo;
-        aa->ma_ei = einfo;
+        req->rq_async_args.pointer_arg[0] = exp;
+        req->rq_async_args.pointer_arg[1] = minfo;
+        req->rq_async_args.pointer_arg[2] = einfo;
          req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
          ptlrpcd_add_req(req);
  
diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c

index 2e663f9..1f1f445 100644 (file)
--- a/lustre/mdc/mdc_reint.c
+++ b/lustre/mdc/mdc_reint.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -70,12 +82,15 @@ int mdc_resource_get_unused(struct obd_export *exp, struct ll_fid *fid,
                              __u64 bits)
  {
          struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
-        struct ldlm_res_id res_id = { .name = {fid->id, fid->generation} };
-        struct ldlm_resource *res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+        struct ldlm_res_id res_id;
+        struct ldlm_resource *res;
          ldlm_policy_data_t policy = {{0}};
          int count;
          ENTRY;
  
+        fid_build_reg_res_name((struct lu_fid*)fid, &res_id);
+        res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+
          if (res == NULL)
                  RETURN(0);
  
@@ -88,7 +103,7 @@ int mdc_resource_get_unused(struct obd_export *exp, struct ll_fid *fid,
  }
  
  struct ptlrpc_request *mdc_prep_elc_req(struct obd_export *exp,
-                                        int bufcount, int *size, int off,
+                                        int bufcount, __u32 *size, int off,
                                          struct list_head *cancels, int count)
  {
          return ldlm_prep_elc_req(exp, LUSTRE_MDS_VERSION, MDS_REINT,
@@ -107,22 +122,39 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data,
  {
          CFS_LIST_HEAD(cancels);
          struct ptlrpc_request *req;
-        struct mds_rec_setattr *rec;
          struct mdc_rpc_lock *rpc_lock;
          struct obd_device *obd = exp->exp_obd;
-        int size[5] = { sizeof(struct ptlrpc_body),
-                        sizeof(*rec), ealen, ea2len,
-                        sizeof(struct ldlm_request) };
-        int count, bufcount = 2, rc;
+        __u32 size[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                        [REQ_REC_OFF] = sizeof(struct mds_rec_setattr),
+                        [REQ_REC_OFF + 1] = ealen,
+                        [REQ_REC_OFF + 2] = ea2len,
+                        [REQ_REC_OFF + 3] = sizeof(struct ldlm_request) };
+        __u32 replysize[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                             [REPLY_REC_OFF] = sizeof(struct mdt_body),
+                             [REPLY_REC_OFF+1] = obd->u.cli.cl_max_mds_easize,
+                             [REPLY_REC_OFF+2] = LUSTRE_POSIX_ACL_MAX_SIZE,
+                             [REPLY_REC_OFF+3] = sizeof(struct lustre_capa),
+                             [REPLY_REC_OFF+4] = sizeof(struct lustre_capa)};
+
+        int count, bufcount = 2, rc, replybufcount = 2;
+        int offset = REQ_REC_OFF + 3;
          __u64 bits;
          ENTRY;
  
          LASSERT(iattr != NULL);
  
-        if (ealen > 0) {
-                bufcount++;
-                if (ea2len > 0)
-                        bufcount++;
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[REQ_REC_OFF] = sizeof(struct mdt_rec_setattr);
+                size[REQ_REC_OFF + 1] = 0; /* capa */
+                size[REQ_REC_OFF + 2] = 0; //sizeof (struct mdt_epoch);
+                size[REQ_REC_OFF + 3] = ealen;
+                size[REQ_REC_OFF + 4] = ea2len;
+                size[REQ_REC_OFF + 5] = sizeof(struct ldlm_request);
+                offset = REQ_REC_OFF + 5;
+                bufcount = 6;
+                replybufcount = 6;
+        } else {
+                bufcount = 4;
          }
  
          bits = MDS_INODELOCK_UPDATE;
@@ -131,9 +163,9 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data,
          count = mdc_resource_get_unused(exp, &op_data->fid1,
                                          &cancels, LCK_EX, bits);
          if (exp_connect_cancelset(exp))
-                bufcount = 5;
+                bufcount ++ ;
          req = mdc_prep_elc_req(exp, bufcount, size,
-                               REQ_REC_OFF + 3, &cancels, count);
+                               offset, &cancels, count);
          if (req == NULL)
                  RETURN(-ENOMEM);
  
@@ -151,8 +183,7 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data,
          mdc_setattr_pack(req, REQ_REC_OFF, op_data, iattr,
                           ea, ealen, ea2, ea2len);
  
-        size[REPLY_REC_OFF] = sizeof(struct mds_body);
-        ptlrpc_req_set_repsize(req, 2, size);
+        ptlrpc_req_set_repsize(req, replybufcount, replysize);
  
          rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL);
          *request = req;
@@ -164,18 +195,28 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data,
  
  int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data,
                 const void *data, int datalen, int mode, __u32 uid, __u32 gid,
-               __u32 cap_effective, __u64 rdev, struct ptlrpc_request **request)
+               cfs_cap_t cap_effective, __u64 rdev,
+               struct ptlrpc_request **request)
  {
          CFS_LIST_HEAD(cancels);
          struct obd_device *obd = exp->exp_obd;
          struct ptlrpc_request *req;
          int level, bufcount = 3, rc;
-        int size[5] = { sizeof(struct ptlrpc_body),
+        __u32 size[6] = { sizeof(struct ptlrpc_body),
                          sizeof(struct mds_rec_create),
                          op_data->namelen + 1, 0, sizeof(struct ldlm_request) };
+        int offset = REQ_REC_OFF + 3;
          int count;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[REQ_REC_OFF] = sizeof(struct mdt_rec_create);
+                size[REQ_REC_OFF + 1] = 0; /* capa */
+                size[REQ_REC_OFF + 2] = op_data->namelen + 1;
+                size[REQ_REC_OFF + 4] = sizeof(struct ldlm_request);
+                bufcount++;
+                offset ++;
+        }
          if (data && datalen) {
                  size[bufcount] = datalen;
                  bufcount++;
@@ -183,10 +224,25 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data,
  
          count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels,
                                          LCK_EX, MDS_INODELOCK_UPDATE);
-        if (exp_connect_cancelset(exp))
-                bufcount = 5;
+        if (exp_connect_cancelset(exp)) {
+                if (mdc_exp_is_2_0_server(exp)) {
+                        bufcount = 6;
+                } else {
+                        bufcount = 5;
+                }
+        }
+
+        if (mdc_exp_is_2_0_server(exp)) {
+                struct client_obd *cli = &obd->u.cli;
+                rc = mdc_fid_alloc(cli->cl_seq, (void *)&op_data->fid2);
+                if (rc) {
+                        CERROR("fid allocation result: %d\n", rc);
+                        RETURN(rc);
+                }
+        }
+
          req = mdc_prep_elc_req(exp, bufcount, size,
-                               REQ_REC_OFF + 3, &cancels, count);
+                               offset, &cancels, count);
          if (req == NULL)
                  RETURN(-ENOMEM);
  
@@ -195,8 +251,9 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data,
          mdc_create_pack(req, REQ_REC_OFF, op_data, data, datalen, mode, uid,
                          gid, cap_effective, rdev);
  
-        size[REPLY_REC_OFF] = sizeof(struct mds_body);
-        ptlrpc_req_set_repsize(req, 2, size);
+        size[REPLY_REC_OFF] = sizeof(struct mdt_body);
+        size[REPLY_REC_OFF+1] = sizeof(struct ost_lvb);
+        ptlrpc_req_set_repsize(req, 3, size);
  
          level = LUSTRE_IMP_FULL;
   resend:
@@ -220,12 +277,23 @@ int mdc_unlink(struct obd_export *exp, struct mdc_op_data *op_data,
          CFS_LIST_HEAD(cancels);
          struct obd_device *obd = class_exp2obd(exp);
          struct ptlrpc_request *req = *request;
-        int size[4] = { sizeof(struct ptlrpc_body),
-                        sizeof(struct mds_rec_unlink),
-                        op_data->namelen + 1, sizeof(struct ldlm_request) };
+        __u32 size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                        [REQ_REC_OFF] = sizeof(struct mds_rec_unlink),
+                        [REQ_REC_OFF + 1] = op_data->namelen + 1,
+                        [REQ_REC_OFF + 2] = sizeof(struct ldlm_request) };
          int count, rc, bufcount = 3;
+        int offset = REQ_REC_OFF + 2;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[REQ_REC_OFF] = sizeof(struct mdt_rec_unlink);
+                size[REQ_REC_OFF + 1] = 0 /* capa */;
+                size[REQ_REC_OFF + 2] = op_data->namelen + 1;
+                size[REQ_REC_OFF + 3] = sizeof(struct ldlm_request);
+                bufcount ++;
+                offset ++;
+        }
+
          LASSERT(req == NULL);
          count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels,
                                          LCK_EX, MDS_INODELOCK_UPDATE);
@@ -233,14 +301,15 @@ int mdc_unlink(struct obd_export *exp, struct mdc_op_data *op_data,
                  count += mdc_resource_get_unused(exp, &op_data->fid3, &cancels,
                                                   LCK_EX, MDS_INODELOCK_FULL);
          if (exp_connect_cancelset(exp))
-                bufcount = 4;
+                bufcount ++;
+
          req = mdc_prep_elc_req(exp, bufcount, size,
-                               REQ_REC_OFF + 2, &cancels, count);
+                               offset, &cancels, count);
          if (req == NULL)
                  RETURN(-ENOMEM);
          *request = req;
  
-        size[REPLY_REC_OFF] = sizeof(struct mds_body);
+        size[REPLY_REC_OFF] = sizeof(struct mdt_body);
          size[REPLY_REC_OFF + 1] = obd->u.cli.cl_max_mds_easize;
          size[REPLY_REC_OFF + 2] = obd->u.cli.cl_max_mds_cookiesize;
          ptlrpc_req_set_repsize(req, 4, size);
@@ -259,26 +328,39 @@ int mdc_link(struct obd_export *exp, struct mdc_op_data *op_data,
          CFS_LIST_HEAD(cancels);
          struct obd_device *obd = exp->exp_obd;
          struct ptlrpc_request *req;
-        int size[4] = { sizeof(struct ptlrpc_body),
-                        sizeof(struct mds_rec_link),
-                        op_data->namelen + 1, sizeof(struct ldlm_request) };
+        __u32 size[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                        [REQ_REC_OFF] = sizeof(struct mds_rec_link),
+                        [REQ_REC_OFF + 1] = op_data->namelen + 1,
+                        [REQ_REC_OFF + 2] = sizeof(struct ldlm_request)};
          int count, rc, bufcount = 3;
+        int offset = REQ_REC_OFF + 2;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[REQ_REC_OFF] = sizeof(struct mdt_rec_link);
+                size[REQ_REC_OFF + 1] = 0; /* capa */
+                size[REQ_REC_OFF + 2] = 0; /* capa */
+                size[REQ_REC_OFF + 3] = op_data->namelen + 1;
+                size[REQ_REC_OFF + 4] = sizeof(struct ldlm_request);
+                bufcount = 5;
+                offset += 2;
+        }
+
          count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels,
                                          LCK_EX, MDS_INODELOCK_UPDATE);
          count += mdc_resource_get_unused(exp, &op_data->fid2, &cancels,
                                           LCK_EX, MDS_INODELOCK_UPDATE);
          if (exp_connect_cancelset(exp))
-                bufcount = 4;
+                bufcount++;
+
          req = mdc_prep_elc_req(exp, bufcount, size,
-                               REQ_REC_OFF + 2, &cancels, count);
+                               offset, &cancels, count);
          if (req == NULL)
                  RETURN(-ENOMEM);
  
          mdc_link_pack(req, REQ_REC_OFF, op_data);
  
-        size[REPLY_REC_OFF] = sizeof(struct mds_body);
+        size[REPLY_REC_OFF] = sizeof(struct mdt_body);
          ptlrpc_req_set_repsize(req, 2, size);
  
          rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
@@ -296,12 +378,26 @@ int mdc_rename(struct obd_export *exp, struct mdc_op_data *op_data,
          CFS_LIST_HEAD(cancels);
          struct obd_device *obd = exp->exp_obd;
          struct ptlrpc_request *req;
-        int size[5] = { sizeof(struct ptlrpc_body),
-                        sizeof(struct mds_rec_rename),
-                        oldlen + 1, newlen + 1, sizeof(struct ldlm_request) };
+        __u32 size[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                        [REQ_REC_OFF] = sizeof(struct mds_rec_rename),
+                        [REQ_REC_OFF + 1] = oldlen + 1,
+                        [REQ_REC_OFF + 2] = newlen + 1,
+                        [REQ_REC_OFF + 3] = sizeof(struct ldlm_request) };
          int count, rc, bufcount = 4;
+        int offset = REQ_REC_OFF + 3;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[REQ_REC_OFF] = sizeof(struct mdt_rec_rename);
+                size[REQ_REC_OFF + 1] = 0; /* capa */
+                size[REQ_REC_OFF + 2] = 0; /* capa */
+                size[REQ_REC_OFF + 3] = oldlen + 1;
+                size[REQ_REC_OFF + 4] = newlen + 1;
+                size[REQ_REC_OFF + 5] = sizeof(struct ldlm_request);
+                bufcount = 6;
+                offset += 2;
+        }
+
          count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels,
                                          LCK_EX, MDS_INODELOCK_UPDATE);
          count += mdc_resource_get_unused(exp, &op_data->fid2, &cancels,
@@ -313,15 +409,16 @@ int mdc_rename(struct obd_export *exp, struct mdc_op_data *op_data,
                  count += mdc_resource_get_unused(exp, &op_data->fid4, &cancels,
                                                   LCK_EX, MDS_INODELOCK_FULL);
          if (exp_connect_cancelset(exp))
-                bufcount = 5;
+                bufcount ++;
+
          req = mdc_prep_elc_req(exp, bufcount, size,
-                               REQ_REC_OFF + 3, &cancels, count);
+                               offset, &cancels, count);
          if (req == NULL)
                  RETURN(-ENOMEM);
  
          mdc_rename_pack(req, REQ_REC_OFF, op_data, old, oldlen, new, newlen);
  
-        size[REPLY_REC_OFF] = sizeof(struct mds_body);
+        size[REPLY_REC_OFF] = sizeof(struct mdt_body);
          size[REPLY_REC_OFF + 1] = obd->u.cli.cl_max_mds_easize;
          size[REPLY_REC_OFF + 2] = obd->u.cli.cl_max_mds_cookiesize;
          ptlrpc_req_set_repsize(req, 4, size);
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c

index 51cc8bc..34d9021 100644 (file)
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -54,21 +66,25 @@ static int mdc_cleanup(struct obd_device *obd);
  extern int mds_queue_req(struct ptlrpc_request *);
  /* Helper that implements most of mdc_getstatus and signal_completed_replay. */
  /* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */
-static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid,
+static int send_getstatus(struct obd_export *exp, struct ll_fid *rootfid,
                            int level, int msg_flags)
  {
          struct ptlrpc_request *req;
          struct mds_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body),
+                          sizeof(struct lustre_capa) };
+        int rc;
          ENTRY;
  
-        req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_GETSTATUS, 2, size,
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, MDS_GETSTATUS, 2, size,
                                NULL);
          if (!req)
                  GOTO(out, rc = -ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          req->rq_send_state = level;
-        ptlrpc_req_set_repsize(req, 2, size);
+        ptlrpc_req_set_repsize(req, 3, size);
  
          mdc_pack_req_body(req, REQ_REC_OFF, 0, NULL, 0, 0);
          lustre_msg_add_flags(req->rq_reqmsg, msg_flags);
@@ -100,23 +116,22 @@ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid,
  /* This should be mdc_get_info("rootfid") */
  int mdc_getstatus(struct obd_export *exp, struct ll_fid *rootfid)
  {
-        return send_getstatus(class_exp2cliimp(exp), rootfid, LUSTRE_IMP_FULL,
-                              0);
+        return send_getstatus(exp, rootfid, LUSTRE_IMP_FULL, 0);
  }
  
  static
-int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, 
+int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
                         unsigned int acl_size, struct ptlrpc_request *req)
  {
          struct obd_device *obddev = class_exp2obd(exp);
          struct mds_body *body;
          void *eadata;
-        int size[4] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[6] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body) };
          int bufcount = 2, rc;
          ENTRY;
  
          /* request message already built */
-
          if (ea_size != 0) {
                  size[bufcount++] = ea_size;
                  CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
@@ -127,6 +142,10 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
                  CDEBUG(D_INODE, "reserved %u bytes for ACL\n", acl_size);
          }
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                bufcount = 6;
+        }
+
          ptlrpc_req_set_repsize(req, bufcount, size);
  
          mdc_enter_request(&obddev->u.cli);
@@ -155,16 +174,6 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
                  }
          }
  
-        if (body->valid & OBD_MD_FLMODEASIZE) {
-                if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize) 
-                        exp->exp_obd->u.cli.cl_max_mds_easize = 
-                                                body->max_mdsize;
-                if (exp->exp_obd->u.cli.cl_max_mds_cookiesize < 
-                                                body->max_cookiesize)
-                        exp->exp_obd->u.cli.cl_max_mds_cookiesize = 
-                                                body->max_cookiesize;
-        }
-
          RETURN (0);
  }
  
@@ -173,7 +182,8 @@ int mdc_getattr(struct obd_export *exp, struct ll_fid *fid,
                  struct ptlrpc_request **request)
  {
          struct ptlrpc_request *req;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body) };
          int acl_size = 0, rc;
          ENTRY;
  
@@ -185,6 +195,7 @@ int mdc_getattr(struct obd_export *exp, struct ll_fid *fid,
          if (!req)
                  GOTO(out, rc = -ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size,
                            MDS_BFLAG_EXT_FLAGS/*request "new" flags(bug 9486)*/);
  
@@ -207,20 +218,32 @@ int mdc_getattr_name(struct obd_export *exp, struct ll_fid *fid,
                       unsigned int ea_size, struct ptlrpc_request **request)
  {
          struct ptlrpc_request *req;
-        struct mds_body *body;
-        int rc, size[3] = { sizeof(struct ptlrpc_body), sizeof(*body), namelen};
+        __u32 size[4] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                          [REQ_REC_OFF] = sizeof(struct mdt_body),
+                          [REQ_REC_OFF + 1] = namelen };
+        int rc;
+        int bufcount = 3;
+        int nameoffset = REQ_REC_OFF + 1;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                size[REQ_REC_OFF + 1] = 0;
+                size[REQ_REC_OFF + 2] = namelen;
+                bufcount ++;
+                nameoffset ++;
+        }
+
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
-                              MDS_GETATTR_NAME, 3, size, NULL);
+                              MDS_GETATTR_NAME, bufcount, size, NULL);
          if (!req)
                  GOTO(out, rc = -ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size,
                            MDS_BFLAG_EXT_FLAGS/*request "new" flags(bug 9486)*/);
- 
+
          LASSERT(strnlen(filename, namelen) == namelen - 1);
-        memcpy(lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, namelen),
+        memcpy(lustre_msg_buf(req->rq_reqmsg, nameoffset, namelen),
                 filename, namelen);
  
          rc = mdc_getattr_common(exp, ea_size, 0, req);
@@ -241,12 +264,24 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid,
  {
          struct obd_device *obddev = class_exp2obd(exp);
          struct ptlrpc_request *req;
-        int size[4] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body) };
-        // int size[3] = {sizeof(struct mds_body)}, bufcnt = 1;
-        int rc, xattr_namelen = 0, bufcnt = 2, offset;
+        __u32 size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+                        [REQ_REC_OFF] = sizeof(struct mdt_body),
+                        [REQ_REC_OFF + 1] = 0, /* capa */
+                        [REQ_REC_OFF + 2] = 0, /* name */
+                        [REQ_REC_OFF + 3] = 0 };
+        int rc, xattr_namelen = 0, bufcnt = 2, offset = REQ_REC_OFF + 1;
          void *tmp;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                bufcnt++;
+                offset++;
+                if (opcode == MDS_SETXATTR) {
+                        size[REQ_REC_OFF] = sizeof (struct mdt_rec_setxattr);
+                        opcode = MDS_REINT;
+                }
+        }
+
          if (xattr_name) {
                  xattr_namelen = strlen(xattr_name) + 1;
                  size[bufcnt++] = xattr_namelen;
@@ -261,10 +296,26 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid,
          if (!req)
                  GOTO(out, rc = -ENOMEM);
  
-        /* request data */
-        mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, output_size, flags);
-
-        offset = REQ_REC_OFF + 1;
+        req->rq_export = class_export_get(exp);
+
+        if (opcode == MDS_REINT && mdc_exp_is_2_0_server(exp)) {
+                struct mdt_rec_setxattr *rec;
+                rec = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF,
+                                     sizeof(struct mdt_rec_setxattr));
+                rec->sx_opcode = REINT_SETXATTR;
+                rec->sx_fsuid  = current->fsuid;
+                rec->sx_fsgid  = current->fsgid;
+                rec->sx_cap    = cfs_curproc_cap_pack();
+                rec->sx_suppgid1 = -1;
+                rec->sx_suppgid2 = -1;
+                rec->sx_fid    = *((struct lu_fid*)fid);
+                rec->sx_valid  = valid;
+                rec->sx_size   = output_size;
+                rec->sx_flags  = flags;
+        } else {
+                /* request data */
+                mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, output_size, flags);
+        }
  
          if (xattr_name) {
                  tmp = lustre_msg_buf(req->rq_reqmsg, offset++, xattr_namelen);
@@ -275,28 +326,32 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid,
                  memcpy(tmp, input, input_size);
          }
  
-        /* reply buffers */
-        if (opcode == MDS_GETXATTR) {
-                size[REPLY_REC_OFF] = sizeof(struct mds_body);
+        size[REPLY_REC_OFF] = sizeof(struct mdt_body);
+        if (mdc_exp_is_2_0_server(exp)) {
                  bufcnt = 2;
          } else {
-                bufcnt = 1;
+                /* reply buffers */
+                if (opcode == MDS_GETXATTR) {
+                        bufcnt = 2;
+                } else {
+                        bufcnt = 1;
+                }
+
          }
  
          /* we do this even output_size is 0, because server is doing that */
          size[bufcnt++] = output_size;
-
          ptlrpc_req_set_repsize(req, bufcnt, size);
  
          /* make rpc */
-        if (opcode == MDS_SETXATTR)
+        if (opcode == MDS_SETXATTR || opcode == MDS_REINT)
                  mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
          else
                  mdc_enter_request(&obddev->u.cli);
  
          rc = ptlrpc_queue_wait(req);
  
-        if (opcode == MDS_SETXATTR)
+        if (opcode == MDS_SETXATTR || opcode == MDS_REINT)
                  mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
          else
                  mdc_exit_request(&obddev->u.cli);
@@ -412,6 +467,7 @@ int mdc_req2lustre_md(struct ptlrpc_request *req, int offset,
                        struct lustre_md *md)
  {
          int rc = 0;
+        int iop = mdc_req_is_2_0_server(req);
          ENTRY;
  
          LASSERT(md);
@@ -455,17 +511,19 @@ int mdc_req2lustre_md(struct ptlrpc_request *req, int offset,
                  }
                  rc = 0;
  
-                offset++;
-        }
-
-        if (md->body->valid & OBD_MD_FLDIREA) {
+                if (!iop)
+                        offset++;
+        } else if (md->body->valid & OBD_MD_FLDIREA) {
                  if(!S_ISDIR(md->body->mode)) {
                          CERROR("OBD_MD_FLDIREA set, should be a directory, but "
                                 "is not\n");
                          GOTO(err_out, rc = -EPROTO);
                  }
-                offset++;
+                if (!iop)
+                        offset++;
          }
+        if (iop)
+                offset++;
  
          /* for ACL, it's possible that FLACL is set but aclsize is zero.
           * only when aclsize != 0 there's an actual segment for ACL in
@@ -534,7 +592,7 @@ static void mdc_replay_open(struct ptlrpc_request *req)
                  EXIT;
                  return;
          }
-        DEBUG_REQ(D_ERROR, req, "mdc open data found");
+        DEBUG_REQ(D_INFO, req, "mdc open data found");
  
          och = mod->mod_och;
          if (och != NULL) {
@@ -655,24 +713,33 @@ static void mdc_commit_close(struct ptlrpc_request *req)
          spin_unlock(&open_req->rq_lock);
  }
  
-int mdc_close(struct obd_export *exp, struct obdo *oa,
+int mdc_close(struct obd_export *exp, struct mdc_op_data *data, struct obdo *oa,
                struct obd_client_handle *och, struct ptlrpc_request **request)
  {
          struct obd_device *obd = class_exp2obd(exp);
-        int reqsize[2] = { sizeof(struct ptlrpc_body),
-                           sizeof(struct mds_body) };
-        int rc, repsize[4] = { sizeof(struct ptlrpc_body),
-                               sizeof(struct mds_body),
+        __u32 reqsize[4] = { sizeof(struct ptlrpc_body),
+                             sizeof(struct mdt_body) };
+        __u32 repsize[4] = { sizeof(struct ptlrpc_body),
+                             sizeof(struct mdt_body),
                                 obd->u.cli.cl_max_mds_easize,
                                 obd->u.cli.cl_max_mds_cookiesize };
+        int rc;
          struct ptlrpc_request *req;
          struct mdc_open_data *mod;
+        int bufcount = 2;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp)) {
+                reqsize[1] = sizeof(struct mdt_epoch);
+                reqsize[2] = sizeof(struct mdt_rec_create);
+                reqsize[3] = 0; /* capa */
+                bufcount = 4;
+        }
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
-                              MDS_CLOSE, 2, reqsize, NULL);
+                              MDS_CLOSE, bufcount, reqsize, NULL);
          if (req == NULL)
                  GOTO(out, rc = -ENOMEM);
+        req->rq_export = class_export_get(exp);
  
          /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
           * portal whose threads are not taking any DLM locks and are therefore
@@ -693,12 +760,13 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
                          GOTO(out, rc = -EIO);
                  }
                  mod->mod_close_req = req;
+                DEBUG_REQ(D_RPCTRACE, mod->mod_close_req, "close req");
                  DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "matched open");
          } else {
                  CDEBUG(D_RPCTRACE, "couldn't find open req; expecting error\n");
          }
  
-        mdc_close_pack(req, REQ_REC_OFF, oa, oa->o_valid, och);
+        mdc_close_pack(req, REQ_REC_OFF, data, oa, oa->o_valid, och);
  
          ptlrpc_req_set_repsize(req, 4, repsize);
          req->rq_commit_cb = mdc_commit_close;
@@ -723,7 +791,7 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
                                  rc = -rc;
                  } else if (mod == NULL) {
                          CERROR("Unexpected: can't find mdc_open_data, but the "
-                               "close succeeded.  Please tell CFS.\n");
+                               "close succeeded.  Please tell <http://bugzilla.lustre.org/>.\n");
                  }
                  if (!lustre_swab_repbuf(req, REPLY_REC_OFF,
                                          sizeof(struct mds_body),
@@ -742,11 +810,14 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
          return rc;
  }
  
-int mdc_done_writing(struct obd_export *exp, struct obdo *obdo)
+int mdc_done_writing(struct obd_export *exp, struct mdc_op_data *data,
+                     struct obdo *obdo)
  {
          struct ptlrpc_request *req;
          struct mds_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body) };
+        int rc;
          ENTRY;
  
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
@@ -754,8 +825,9 @@ int mdc_done_writing(struct obd_export *exp, struct obdo *obdo)
          if (req == NULL)
                  RETURN(-ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
-        mdc_pack_fid(&body->fid1, obdo->o_id, 0, obdo->o_mode);
+        body->fid1 = data->fid1;
          body->size = obdo->o_size;
          body->blocks = obdo->o_blocks;
          body->flags = obdo->o_flags;
@@ -776,7 +848,9 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *fid, __u64 offset,
          struct ptlrpc_request *req = NULL;
          struct ptlrpc_bulk_desc *desc = NULL;
          struct mds_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body) };
+        int rc;
          ENTRY;
  
          CDEBUG(D_INODE, "inode: "LPU64"\n", fid->id);
@@ -786,6 +860,7 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *fid, __u64 offset,
          if (req == NULL)
                  GOTO(out, rc = -ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          req->rq_request_portal = MDS_READPAGE_PORTAL;
          ptlrpc_at_set_req_timeout(req);
  
@@ -835,14 +910,10 @@ static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
          int rc;
          ENTRY;
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        MOD_INC_USE_COUNT;
-#else
          if (!try_module_get(THIS_MODULE)) {
                  CERROR("Can't get module. Is it alive?");
                  return -EINVAL;
          }
-#endif
          switch (cmd) {
          case OBD_IOC_CLIENT_RECOVER:
                  rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1);
@@ -871,17 +942,15 @@ static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                  rc = lquota_poll_check(quota_interface, exp,
                                         (struct if_quotacheck *)karg);
                  GOTO(out, rc);
+        case OBD_IOC_PING_TARGET:
+                rc = ptlrpc_obd_ping(obd);
+                GOTO(out, rc);
          default:
                  CERROR("mdc_ioctl(): unrecognised ioctl %#x\n", cmd);
                  GOTO(out, rc = -ENOTTY);
          }
  out:
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        MOD_DEC_USE_COUNT;
-#else
          module_put(THIS_MODULE);
-#endif
-
          return rc;
  }
  
@@ -918,9 +987,10 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen,
                         exp->exp_obd->obd_name, imp->imp_initial_recov_bk);
                  RETURN(0);
          }
-        if (KEY_IS(KEY_READONLY)) {
+        /* Accept the broken "read-only" key for 1.6.6 servers. b=17493 */
+        if (KEY_IS(KEY_READONLY) || KEY_IS(KEY_READONLY_166COMPAT)) {
                  struct ptlrpc_request *req;
-                int size[3] = { sizeof(struct ptlrpc_body), keylen, vallen };
+                __u32 size[3] = { sizeof(struct ptlrpc_body), keylen, vallen };
                  char *bufs[3] = { NULL, key, val };
  
                  if (vallen != sizeof(int))
@@ -941,6 +1011,7 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen,
                  if (req == NULL)
                          RETURN(-ENOMEM);
  
+                req->rq_export = class_export_get(exp);
                  ptlrpc_req_set_repsize(req, 1, NULL);
                  if (set) {
                          rc = 0;
@@ -958,7 +1029,7 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen,
  }
  
  int mdc_get_info(struct obd_export *exp, __u32 keylen, void *key,
-                 __u32 *vallen, void *val)
+                 __u32 *vallen, void *val, struct lov_stripe_md *lsm)
  {
          int rc = -EINVAL;
  
@@ -983,10 +1054,11 @@ static int mdc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
          struct ptlrpc_request *req;
          struct obd_statfs *msfs;
          struct obd_import     *imp = NULL;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*msfs) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*msfs) };
+        int rc;
          ENTRY;
  
-        /*Since the request might also come from lprocfs, so we need 
+        /*Since the request might also come from lprocfs, so we need
           *sync this with client_disconnect_export Bug15684*/
          down_read(&obd->u.cli.cl_sem);
          if (obd->u.cli.cl_import)
@@ -994,7 +1066,7 @@ static int mdc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
          up_read(&obd->u.cli.cl_sem);
          if (!imp)
                  RETURN(-ENODEV);
-        
+
  
          /* We could possibly pass max_age in the request (as an absolute
           * timestamp or a "seconds.usec ago") so the target can avoid doing
@@ -1002,7 +1074,7 @@ static int mdc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
           * during mount that would help a bit).  Having relative timestamps
           * is not so great if request processing is slow, while absolute
           * timestamps are not ideal because they need time synchronization. */
-        req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_STATFS, 1, NULL, 
+        req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_STATFS, 1, NULL,
                                NULL);
          if (!req)
                  GOTO(output, rc = -ENOMEM);
@@ -1036,21 +1108,27 @@ output:
          return rc;
  }
  
-static int mdc_pin(struct obd_export *exp, obd_id ino, __u32 gen, int type,
+static int mdc_pin(struct obd_export *exp, struct ll_fid *fid,
                     struct obd_client_handle *handle, int flag)
  {
          struct ptlrpc_request *req;
          struct mds_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body), 0 };
+        int rc;
+        int bufcount = 2;
          ENTRY;
  
+        if (mdc_exp_is_2_0_server(exp))
+                bufcount = 3;
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
-                              MDS_PIN, 2, size, NULL);
+                              MDS_PIN, bufcount, size, NULL);
          if (req == NULL)
                  RETURN(-ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
-        mdc_pack_fid(&body->fid1, ino, gen, type);
+        body->fid1 = *fid;
          body->flags = flag;
  
          ptlrpc_req_set_repsize(req, 2, size);
@@ -1089,7 +1167,9 @@ static int mdc_unpin(struct obd_export *exp,
  {
          struct ptlrpc_request *req;
          struct mds_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body) };
+        int rc;
          ENTRY;
  
          if (handle->och_magic != OBD_CLIENT_HANDLE_MAGIC)
@@ -1100,6 +1180,7 @@ static int mdc_unpin(struct obd_export *exp,
          if (req == NULL)
                  RETURN(-ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
          memcpy(&body->handle, &handle->och_fh, sizeof(body->handle));
          body->flags = flag;
@@ -1122,15 +1203,21 @@ int mdc_sync(struct obd_export *exp, struct ll_fid *fid,
               struct ptlrpc_request **request)
  {
          struct ptlrpc_request *req;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body),
+                          sizeof(struct mdt_body), 0 };
+        int bufcount = 2;
          int rc;
          ENTRY;
  
+
+        if (mdc_exp_is_2_0_server(exp))
+                bufcount = 3;
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
-                              MDS_SYNC, 2, size, NULL);
+                              MDS_SYNC, bufcount, size, NULL);
          if (!req)
                  RETURN(rc = -ENOMEM);
  
+        req->rq_export = class_export_get(exp);
          mdc_pack_req_body(req, REQ_REC_OFF, 0, fid, 0, 0);
  
          ptlrpc_req_set_repsize(req, 2, size);
@@ -1239,7 +1326,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp)
  {
          struct obd_device *obd = mdc_exp->exp_obd;
          struct client_obd *cli = &obd->u.cli;
-        struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC };
+        struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
          struct lov_desc desc;
          __u32 valsize = sizeof(desc);
          __u32 stripes;
@@ -1247,14 +1334,14 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp)
          ENTRY;
  
          rc = obd_get_info(lov_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC,
-                          &valsize, &desc);
+                          &valsize, &desc, NULL);
          if (rc)
                  RETURN(rc);
  
          stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT);
          lsm.lsm_stripe_count = stripes;
          size = obd_size_diskmd(lov_exp, &lsm);
-        
+
          if (cli->cl_max_mds_easize < size)
                  cli->cl_max_mds_easize = size;
  
@@ -1270,7 +1357,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp)
  
          CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
                 cli->cl_max_mds_easize, cli->cl_max_mds_cookiesize);
-        
+
          RETURN(0);
  }
  
@@ -1280,17 +1367,18 @@ static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
          ENTRY;
  
          switch (stage) {
-        case OBD_CLEANUP_EARLY: 
+        case OBD_CLEANUP_EARLY:
          case OBD_CLEANUP_EXPORTS:
                  /* If we set up but never connected, the
                     client import will not have been cleaned. */
                  if (obd->u.cli.cl_import) {
                          struct obd_import *imp;
+                        down_write(&obd->u.cli.cl_sem);
                          imp = obd->u.cli.cl_import;
                          CERROR("client import never connected\n");
                          ptlrpc_invalidate_import(imp);
-                        ptlrpc_free_rq_pool(imp->imp_rq_pool);
                          class_destroy_import(imp);
+                        up_write(&obd->u.cli.cl_sem);
                          obd->u.cli.cl_import = NULL;
                  }
                  rc = obd_llog_finish(obd, 0);
@@ -1322,7 +1410,7 @@ static int mdc_cleanup(struct obd_device *obd)
  
  
  static int mdc_llog_init(struct obd_device *obd, struct obd_device *tgt,
-                         int count, struct llog_catid *logid, 
+                         int count, struct llog_catid *logid,
                           struct obd_uuid *uuid)
  {
          struct llog_ctxt *ctxt;
@@ -1343,21 +1431,38 @@ static int mdc_llog_init(struct obd_device *obd, struct obd_device *tgt,
                  ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT);
                  llog_initiator_connect(ctxt);
                  llog_ctxt_put(ctxt);
+        } else {
+                GOTO(err_cleanup, rc);
          }
  
          RETURN(rc);
+err_cleanup:
+        ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
+        ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
+        return rc;
  }
  
  static int mdc_llog_finish(struct obd_device *obd, int count)
  {
-        int rc;
+        struct llog_ctxt *ctxt;
+        int rc = 0;
          ENTRY;
  
-        rc = llog_cleanup(llog_get_context(obd, LLOG_LOVEA_REPL_CTXT));
-        if (rc) {
-                CERROR("can not cleanup LLOG_CONFIG_REPL_CTXT rc %d\n", rc);
+        ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT);
+        if (ctxt) {
+                rc = llog_cleanup(ctxt);
+                if (rc) {
+                        CERROR("Can not cleanup LLOG_CONFIG_REPL_CTXT "
+                               "rc %d\n", rc);
+                }
          }
-        rc = llog_cleanup(llog_get_context(obd, LLOG_CONFIG_REPL_CTXT));
+        ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+        if (ctxt)
+                rc = llog_cleanup(ctxt);
          RETURN(rc);
  }
  
@@ -1368,11 +1473,62 @@ static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf)
          int rc = 0;
  
          lprocfs_mdc_init_vars(&lvars);
-        
+
          rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars, lcfg, obd);
          return(rc);
  }
  
+static int mdc_fid_init(struct obd_export *exp)
+{
+        struct client_obd *cli;
+        char              *prefix;
+        int                rc;
+        ENTRY;
+
+        cli = &exp->exp_obd->u.cli;
+
+        OBD_ALLOC_PTR(cli->cl_seq);
+        if (cli->cl_seq == NULL)
+                RETURN(-ENOMEM);
+
+        OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+        if (prefix == NULL)
+                GOTO(out_free_seq, rc = -ENOMEM);
+
+        snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", exp->exp_obd->obd_name);
+
+        /* Init client side sequence-manager */
+        rc = seq_client_init(cli->cl_seq, exp,
+                             LUSTRE_SEQ_METADATA,
+                             LUSTRE_SEQ_MAX_WIDTH,
+                             prefix);
+        OBD_FREE(prefix, MAX_OBD_NAME + 5);
+        if (rc)
+                GOTO(out_free_seq, rc);
+
+        RETURN(rc);
+
+out_free_seq:
+        OBD_FREE_PTR(cli->cl_seq);
+        cli->cl_seq = NULL;
+        return rc;
+}
+
+static int mdc_fid_fini(struct obd_export *exp)
+{
+        struct client_obd *cli = &exp->exp_obd->u.cli;
+        ENTRY;
+
+        if (cli->cl_seq != NULL) {
+                LASSERT(cli->cl_seq->lcs_exp == exp);
+                seq_client_fini(cli->cl_seq);
+                OBD_FREE_PTR(cli->cl_seq);
+                cli->cl_seq = NULL;
+        }
+
+        RETURN(0);
+}
+
  struct obd_ops mdc_obd_ops = {
          .o_owner        = THIS_MODULE,
          .o_setup        = mdc_setup,
@@ -1382,6 +1538,8 @@ struct obd_ops mdc_obd_ops = {
          .o_del_conn     = client_import_del_conn,
          .o_connect      = client_connect_import,
          .o_disconnect   = client_disconnect_export,
+        .o_fid_init     = mdc_fid_init,
+        .o_fid_fini     = mdc_fid_fini,
          .o_iocontrol    = mdc_iocontrol,
          .o_set_info_async = mdc_set_info_async,
          .o_get_info     = mdc_get_info,
@@ -1420,7 +1578,7 @@ static void /*__exit*/ mdc_exit(void)
          class_unregister_type(LUSTRE_MDC_NAME);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Metadata Client");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/mds/autoMakefile.am b/lustre/mds/autoMakefile.am

index e5bdbcf..d2aafc6 100644 (file)
--- a/lustre/mds/autoMakefile.am
+++ b/lustre/mds/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if MODULES
  modulefs_DATA = mds$(KMODEXT)
diff --git a/lustre/mds/commit_confd.c b/lustre/mds/commit_confd.c

deleted file mode 100644 (file)

index 8dd2fcd..0000000
--- a/lustre/mds/commit_confd.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2005 Cluster File Systems, Inc.
- *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
- *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
- *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
- */
-
-void commit_add(struct )
-{
-        struct obd_import *import = commit_uuid2import(rec->  uuid);
-
-        if (!import) {
-                CERROR("unaware of OST UUID %s - dorpping\n", rec-> uuid);
-                EXIT;
-                return;
-        }
-
-        spin_lock(&import->llcconf_lock);
-        list_add(&rec->  &import);
-        spin_unlock(&import->llcconf_lock);
-        EXIT;
-        return;
-}
-
-void commit_confd_conf_import(struct obd_import *import,
-                              struct llog_commit_confirm_daemon *lccd)
-{
-        struct list_head *tmp, *save;
-
-
-        list_for_each_safe(&import->import_cc_list, tmp, save) {
-                struct llog_canceld_ctxt *cd;
-
-                if (atomic_read(import->import_cc_count) <=
-                    lccd->llcconf_lowwater)
-                        break;
-
-                cd = list_entry(tmp, struct llog_canceld_ctxt *, llcconf_entry);
-                atomic_dec(&import->import_cc_count);
-                commit_confd_add_and_fire(cd);
-        }
-        EXIT;
-        return;
-}
-
-
-int commit_confd_main(void *data)
-{
-        struct llog_commit_confirm_daemon *lccd = data;
-
-        while (1) {
-                /* something has happened */
-                event_wait();
-
-                if (lccd->flags & LCCD_STOP)
-                        break;
-
-
-                /* lock llccd imporlist */
-                spin_lock(&lccd->llcconf_lock);
-                list_for_each_safe(&lccd->llcconf_list,   ) {
-                        struct obd_import *import;
-                        import = list_entry(&lccd->llcconf_list,
-                                            struct obd_import,
-                                            import_entry);
-                        get_import(import);
-                        spin_unlock(&lccd->llcconf_lock);
-                        if (atomic_read(import->import_cc_count) >
-                            lccd->llcconf_highwater)
-                                commit_confd_conf_import(import);
-                        put_import(import);
-                        spin_lock(&lccd->llcconf_lock);
-
-                }
-                spin_unlock(&lccd->llcconf_lock);
-
-        }
-
-        lccd->flags = LCCD_STOPPED;
-        RETURN(0);
-}
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c

index 7d70520..3dceb6b 100644 (file)
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -1,32 +1,44 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mds/handler.c
- *  Lustre Metadata Server (mds) request handler
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2005 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Mike Shaver <shaver@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/handler.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -40,21 +52,16 @@
  #include <linux/random.h>
  #include <linux/fs.h>
  #include <linux/jbd.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  # include <linux/smp_lock.h>
  # include <linux/buffer_head.h>
  # include <linux/workqueue.h>
  # include <linux/mount.h>
-#else
-# include <linux/locks.h>
-#endif
  
  #include <obd_class.h>
  #include <lustre_dlm.h>
  #include <obd_lov.h>
  #include <lustre_fsfilt.h>
  #include <lprocfs_status.h>
-#include <lustre_commit_confd.h>
  #include <lustre_quota.h>
  #include <lustre_disk.h>
  #include <lustre_param.h>
@@ -135,8 +142,8 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                  CERROR("Req deadline already passed %lu (now: %lu)\n",
                         req->rq_deadline, cfs_time_current_sec());
          }
-        lwi = LWI_TIMEOUT(max(timeout, 1) * HZ, NULL, NULL);
-        rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
+        lwi = LWI_TIMEOUT(cfs_time_seconds(max(timeout, 1)), NULL, NULL);
+        rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc), &lwi);
          LASSERT (rc == 0 || rc == -ETIMEDOUT);
  
          if (rc == 0) {
@@ -157,7 +164,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
  
          EXIT;
   abort_bulk:
-        ptlrpc_abort_bulk (desc);
+        ptlrpc_abort_bulk(desc);
   cleanup_buf:
          for (i = 0; i < npages; i++)
                  if (pages[i])
@@ -180,7 +187,7 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid,
          struct dentry *de = mds_fid2dentry(mds, fid, mnt), *retval = de;
          struct ldlm_res_id res_id = { .name = {0} };
          int flags = LDLM_FL_ATOMIC_CB, rc;
-        ldlm_policy_data_t policy = { .l_inodebits = { lockpart} }; 
+        ldlm_policy_data_t policy = { .l_inodebits = { lockpart} };
          ENTRY;
  
          if (IS_ERR(de))
@@ -188,8 +195,8 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid,
  
          res_id.name[0] = de->d_inode->i_ino;
          res_id.name[1] = de->d_inode->i_generation;
-        rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, 
-                                    LDLM_IBITS, &policy, lock_mode, &flags, 
+        rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id,
+                                    LDLM_IBITS, &policy, lock_mode, &flags,
                                      ldlm_blocking_ast, ldlm_completion_ast,
                                      NULL, NULL, 0, NULL, lockh);
          if (rc != ELDLM_OK) {
@@ -206,6 +213,7 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid,
  struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
                                struct vfsmount **mnt)
  {
+        struct obd_device *obd = container_of(mds, struct obd_device, u.mds);
          char fid_name[32];
          unsigned long ino = fid->id;
          __u32 generation = fid->generation;
@@ -222,7 +230,7 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
  
          /* under ext3 this is neither supposed to return bad inodes
             nor NULL inodes. */
-        result = ll_lookup_one_len(fid_name, mds->mds_fid_de, strlen(fid_name));
+        result = mds_lookup(obd, fid_name, mds->mds_fid_de, strlen(fid_name));
          if (IS_ERR(result))
                  RETURN(result);
  
@@ -233,8 +241,6 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
         if (inode->i_nlink == 0) {
                  if (inode->i_mode == 0 &&
                      LTIME_S(inode->i_ctime) == 0 ) {
-                        struct obd_device *obd = container_of(mds, struct
-                                                              obd_device, u.mds);
                          LCONSOLE_WARN("Found inode with zero nlink, mode and "
                                        "ctime -- this may indicate disk"
                                        "corruption (device %s, inode %lu, link:"
@@ -265,7 +271,7 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
          RETURN(result);
  }
  
-static int mds_connect_internal(struct obd_export *exp, 
+static int mds_connect_internal(struct obd_export *exp,
                                  struct obd_connect_data *data)
  {
          struct obd_device *obd = exp->exp_obd;
@@ -302,7 +308,8 @@ static int mds_connect_internal(struct obd_export *exp,
  
  static int mds_reconnect(struct obd_export *exp, struct obd_device *obd,
                           struct obd_uuid *cluuid,
-                         struct obd_connect_data *data)
+                         struct obd_connect_data *data,
+                         void *localdata)
  {
          int rc;
          ENTRY;
@@ -311,6 +318,8 @@ static int mds_reconnect(struct obd_export *exp, struct obd_device *obd,
                  RETURN(-EINVAL);
  
          rc = mds_connect_internal(exp, data);
+        if (rc == 0)
+                mds_export_stats_init(obd, exp, localdata);
  
          RETURN(rc);
  }
@@ -328,18 +337,14 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
          struct obd_export *exp;
          struct mds_export_data *med;
          struct lsd_client_data *lcd = NULL;
-        int rc, abort_recovery;
+        int rc;
          ENTRY;
  
          if (!conn || !obd || !cluuid)
                  RETURN(-EINVAL);
  
          /* Check for aborted recovery. */
-        spin_lock_bh(&obd->obd_processing_task_lock);
-        abort_recovery = obd->obd_abort_recovery;
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-        if (abort_recovery)
-                target_abort_recovery(obd);
+        target_recovery_check_and_stop(obd);
  
          /* XXX There is a small race between checking the list and adding a
           * new connection for the same UUID, but the real threat (list
@@ -389,15 +394,16 @@ out:
  int mds_init_export(struct obd_export *exp)
  {
          struct mds_export_data *med = &exp->exp_mds_data;
+        ENTRY;
  
          INIT_LIST_HEAD(&med->med_open_head);
          spin_lock_init(&med->med_open_lock);
-        
+
          spin_lock(&exp->exp_lock);
          exp->exp_connecting = 1;
          spin_unlock(&exp->exp_lock);
  
-        RETURN(0);
+        RETURN(ldlm_init_export(exp));
  }
  
  static int mds_destroy_export(struct obd_export *export)
@@ -407,47 +413,57 @@ static int mds_destroy_export(struct obd_export *export)
          struct mds_obd *mds = &obd->u.mds;
          struct lvfs_run_ctxt saved;
          struct lov_mds_md *lmm;
+        __u32 lmm_sz, cookie_sz;
          struct llog_cookie *logcookies;
+        struct list_head closing_list;
+        struct mds_file_data *mfd, *n;
          int rc = 0;
          ENTRY;
  
          med = &export->exp_mds_data;
          target_destroy_export(export);
+        ldlm_destroy_export(export);
  
          if (obd_uuid_equals(&export->exp_client_uuid, &obd->obd_uuid))
                  RETURN(0);
  
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        /* Close any open files (which may also cause orphan unlinking). */
-
-        OBD_ALLOC(lmm, mds->mds_max_mdsize);
+        lmm_sz = mds->mds_max_mdsize;
+        OBD_ALLOC(lmm, lmm_sz);
          if (lmm == NULL) {
                  CWARN("%s: allocation failure during cleanup; can not force "
                        "close file handles on this service.\n", obd->obd_name);
                  GOTO(out, rc = -ENOMEM);
          }
  
-        OBD_ALLOC(logcookies, mds->mds_max_cookiesize);
+        cookie_sz = mds->mds_max_cookiesize;
+        OBD_ALLOC(logcookies, cookie_sz);
          if (logcookies == NULL) {
                  CWARN("%s: allocation failure during cleanup; can not force "
                        "close file handles on this service.\n", obd->obd_name);
-                OBD_FREE(lmm, mds->mds_max_mdsize);
+                OBD_FREE(lmm, lmm_sz);
                  GOTO(out, rc = -ENOMEM);
          }
  
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        /* Close any open files (which may also cause orphan unlinking). */
+        CFS_INIT_LIST_HEAD(&closing_list);
          spin_lock(&med->med_open_lock);
          while (!list_empty(&med->med_open_head)) {
                  struct list_head *tmp = med->med_open_head.next;
                  struct mds_file_data *mfd =
                          list_entry(tmp, struct mds_file_data, mfd_list);
-                int lmm_size = mds->mds_max_mdsize;
-                umode_t mode = mfd->mfd_dentry->d_inode->i_mode;
-                __u64 valid = 0;
  
                  /* Remove mfd handle so it can't be found again.
                   * We are consuming the mfd_list reference here. */
                  mds_mfd_unlink(mfd, 0);
-                spin_unlock(&med->med_open_lock);
+                list_add_tail(&mfd->mfd_list, &closing_list);
+        }
+        spin_unlock(&med->med_open_lock);
+
+        list_for_each_entry_safe(mfd, n, &closing_list, mfd_list) {
+                int lmm_size = lmm_sz;
+                umode_t mode = mfd->mfd_dentry->d_inode->i_mode;
+                __u64 valid = 0;
  
                  /* If you change this message, be sure to update
                   * replay_single:test_46 */
@@ -457,7 +473,7 @@ static int mds_destroy_export(struct obd_export *export)
                         mfd->mfd_dentry->d_inode->i_ino);
  
                  rc = mds_get_md(obd, mfd->mfd_dentry->d_inode, lmm,
-                                &lmm_size, 1, 0);
+                                &lmm_size, 1, 0, 0);
                  if (rc < 0)
                          CWARN("mds_get_md failure, rc=%d\n", rc);
                  else
@@ -467,6 +483,7 @@ static int mds_destroy_export(struct obd_export *export)
                   * is_orphan race, mds_mfd_close drops it */
                  MDS_DOWN_WRITE_ORPHAN_SEM(mfd->mfd_dentry->d_inode);
  
+                list_del_init(&mfd->mfd_list);
                  rc = mds_mfd_close(NULL, REQ_REC_OFF, obd, mfd,
                                     !(export->exp_flags & OBD_OPT_FAILOVER),
                                     lmm, lmm_size, logcookies,
@@ -487,13 +504,10 @@ static int mds_destroy_export(struct obd_export *export)
                          valid &= ~OBD_MD_FLCOOKIE;
                  }
  
-                spin_lock(&med->med_open_lock);
          }
  
-        OBD_FREE(logcookies, mds->mds_max_cookiesize);
-        OBD_FREE(lmm, mds->mds_max_mdsize);
-
-        spin_unlock(&med->med_open_lock);
+        OBD_FREE(logcookies, cookie_sz);
+        OBD_FREE(lmm, lmm_sz);
  
          pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          mds_client_free(export);
@@ -515,6 +529,9 @@ static int mds_disconnect(struct obd_export *exp)
          if (exp->exp_obd->obd_namespace != NULL)
                  ldlm_cancel_locks_for_export(exp);
  
+        /* release nid stat refererence */
+        lprocfs_exp_cleanup(exp);
+
          /* complete all outstanding replies */
          spin_lock(&exp->exp_lock);
          while (!list_empty(&exp->exp_outstanding_replies)) {
@@ -557,10 +574,11 @@ static int mds_getstatus(struct ptlrpc_request *req)
  
  /* get the LOV EA from @inode and store it into @md.  It can be at most
   * @size bytes, and @size is updated with the actual EA size.
- * The EA size is also returned on success, and -ve errno on failure. 
+ * The EA size is also returned on success, and -ve errno on failure.
   * If there is no EA then 0 is returned. */
  int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
-               int *size, int lock, int flags)
+               int *size, int lock, int flags,
+               __u64 connect_flags)
  {
          int rc = 0;
          int lmm_size = 0;
@@ -577,7 +595,8 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
                         rc, inode->i_ino);
          } else if (rc > 0) {
                  lmm_size = rc;
-                rc = mds_convert_lov_ea(obd, inode, md, lmm_size);
+                rc = mds_convert_lov_ea(obd, inode, md, lmm_size,
+                                        connect_flags);
  
                  if (rc == 0) {
                          *size = lmm_size;
@@ -598,7 +617,8 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
  /* Call with lock=1 if you want mds_pack_md to take the i_mutex.
   * Call with lock=0 if the caller has already taken the i_mutex. */
  int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
-                struct mds_body *body, struct inode *inode, int lock, int flags)
+                struct mds_body *body, struct inode *inode, int lock, int flags,
+                __u64 connect_flags)
  {
          struct mds_obd *mds = &obd->u.mds;
          void *lmm;
@@ -614,7 +634,10 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
                         inode->i_ino);
                  RETURN(0);
          }
+        /* if this replay request we should be silencely exist without fill md*/
          lmm_size = lustre_msg_buflen(msg, offset);
+        if (lmm_size == 0)
+                RETURN(0);
  
          /* I don't really like this, but it is a sanity check on the client
           * MD request.  However, if the client doesn't know how much space
@@ -626,7 +649,8 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
                  // RETURN(-EINVAL);
          }
  
-        rc = mds_get_md(obd, inode, lmm, &lmm_size, lock, flags);
+        rc = mds_get_md(obd, inode, lmm, &lmm_size, lock, flags,
+                        connect_flags);
          if (rc > 0) {
                  if (S_ISDIR(inode->i_mode))
                          body->valid |= OBD_MD_FLDIREA;
@@ -658,12 +682,9 @@ int mds_pack_posix_acl(struct inode *inode, struct lustre_msg *repmsg,
          if (!inode->i_op || !inode->i_op->getxattr)
                  GOTO(out, 0);
  
-        lock_24kernel();
          rc = inode->i_op->getxattr(&de, MDS_XATTR_NAME_ACL_ACCESS,
                                     lustre_msg_buf(repmsg, repoff, buflen),
                                     buflen);
-        unlock_24kernel();
-
          if (rc >= 0)
                  repbody->aclsize = rc;
          else if (rc != -ENODATA) {
@@ -702,7 +723,6 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
          body = lustre_msg_buf(req->rq_repmsg, reply_off, sizeof(*body));
          LASSERT(body != NULL);                 /* caller prepped reply */
  
-        mds_pack_inode2fid(&body->fid1, inode);
          body->flags = reqbody->flags; /* copy MDS_BFLAG_EXT_FLAGS if present */
          mds_pack_inode2body(body, inode);
          reply_off++;
@@ -714,7 +734,8 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
                          flags = MDS_GETATTR;
  
                  rc = mds_pack_md(obd, req->rq_repmsg, reply_off, body,
-                                 inode, 1, flags);
+                                 inode, 1, flags,
+                                 req->rq_export->exp_connect_flags);
  
                  /* If we have LOV EA data, the OST holds size, atime, mtime */
                  if (!(body->valid & OBD_MD_FLEASIZE) &&
@@ -811,7 +832,7 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
                         rc, inode->i_ino);
                  if ((rc == 0) && (lustre_msg_get_opc(req->rq_reqmsg) == MDS_GETATTR) &&
                       ((S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))))
-                        rc = sizeof(struct lov_mds_md);
+                        rc = sizeof(struct lov_mds_md_v3);
                  if (rc < 0) {
                          if (rc != -ENODATA) {
                                  CERROR("error getting inode %lu MD: rc = %d\n",
@@ -845,11 +866,8 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
  
                  size[bufcount] = 0;
                  if (inode->i_op && inode->i_op->getxattr) {
-                        lock_24kernel();
                          rc = inode->i_op->getxattr(&de, MDS_XATTR_NAME_ACL_ACCESS,
                                                     NULL, 0);
-                        unlock_24kernel();
-
                          if (rc < 0) {
                                  if (rc != -ENODATA) {
                                          CERROR("got acl size: %d\n", rc);
@@ -954,7 +972,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
          }
  #endif
  
-        /* child_lockh() is only set in fixup_handle_for_resent_req() 
+        /* child_lockh() is only set in fixup_handle_for_resent_req()
           * if MSG_RESENT is set */
          if (lustre_handle_is_used(child_lockh)) {
                  LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT);
@@ -964,9 +982,9 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
          if (resent_req == 0) {
                  if (name) {
                          OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RESEND, obd_timeout*2);
-                        rc = mds_get_parent_child_locked(obd, &obd->u.mds, 
+                        rc = mds_get_parent_child_locked(obd, &obd->u.mds,
                                                           &body->fid1,
-                                                         &parent_lockh, 
+                                                         &parent_lockh,
                                                           &dparent, LCK_CR,
                                                           MDS_INODELOCK_UPDATE,
                                                           name, namesize,
@@ -980,7 +998,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                          LASSERT(dchild);
                          if (IS_ERR(dchild))
                                  rc = PTR_ERR(dchild);
-                } 
+                }
                  if (rc)
                          GOTO(cleanup, rc);
          } else {
@@ -989,7 +1007,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                  struct ldlm_resource *res;
                  DEBUG_REQ(D_DLMTRACE, req, "resent, not enqueuing new locks");
                  granted_lock = ldlm_handle2lock(child_lockh);
-                /* lock was granted in fixup_handle_for_resent_req() if 
+                /* lock was granted in fixup_handle_for_resent_req() if
                   * MSG_RESENT is set */
                  LASSERTF(granted_lock != NULL, LPU64"/%u lockh "LPX64"\n",
                           body->fid1.id, body->fid1.generation,
@@ -1133,7 +1151,7 @@ static int mds_statfs(struct ptlrpc_request *req)
  
          /* This will trigger a watchdog timeout */
          OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,
-                         (MDS_SERVICE_WATCHDOG_FACTOR * 
+                         (MDS_SERVICE_WATCHDOG_FACTOR *
                            at_get(&svc->srv_at_estimate) / 1000) + 1);
          OBD_COUNTER_INCREMENT(obd, statfs);
  
@@ -1187,7 +1205,6 @@ static int mds_sync(struct ptlrpc_request *req, int offset)
  
                  body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
                                        sizeof(*body));
-                mds_pack_inode2fid(&body->fid1, de->d_inode);
                  mds_pack_inode2body(body, de->d_inode);
  
                  l_dput(de);
@@ -1319,6 +1336,7 @@ static int mds_filter_recovery_request(struct ptlrpc_request *req,
          case MDS_CLOSE:
          case MDS_SYNC: /* used in unmounting */
          case OBD_PING:
+        case MDS_SETXATTR:
          case MDS_REINT:
          case LDLM_ENQUEUE:
                  *process = target_queue_recovery_request(req, obd);
@@ -1364,7 +1382,8 @@ static int mds_set_info_rpc(struct obd_export *exp, struct ptlrpc_request *req)
  
          lustre_msg_set_status(req->rq_repmsg, 0);
  
-        if (KEY_IS(KEY_READONLY)) {
+        /* Accept the broken "read-only" key from 1.6.6 clients. b=17493 */
+        if (KEY_IS(KEY_READONLY) || KEY_IS(KEY_READONLY_166COMPAT)) {
                  if (val == NULL || vallen < sizeof(__u32)) {
                          DEBUG_REQ(D_HA, req, "no set_info val");
                          RETURN(-EFAULT);
@@ -1381,6 +1400,7 @@ static int mds_set_info_rpc(struct obd_export *exp, struct ptlrpc_request *req)
          RETURN(0);
  }
  
+#ifdef HAVE_QUOTA_SUPPORT
  static int mds_handle_quotacheck(struct ptlrpc_request *req)
  {
          struct obd_quotactl *oqctl;
@@ -1421,6 +1441,7 @@ static int mds_handle_quotactl(struct ptlrpc_request *req)
          *repoqc = *oqctl;
          RETURN(0);
  }
+#endif
  
  static int mds_msg_check_version(struct lustre_msg *msg)
  {
@@ -1515,7 +1536,7 @@ int mds_handle(struct ptlrpc_request *req)
          /* XXX identical to OST */
          if (lustre_msg_get_opc(req->rq_reqmsg) != MDS_CONNECT) {
                  struct mds_export_data *med;
-                int recovering, abort_recovery;
+                int recovering;
  
                  if (req->rq_export == NULL) {
                          CERROR("operation %d on unconnected MDS from %s\n",
@@ -1550,12 +1571,10 @@ int mds_handle(struct ptlrpc_request *req)
  
                  /* Check for aborted recovery. */
                  spin_lock_bh(&obd->obd_processing_task_lock);
-                abort_recovery = obd->obd_abort_recovery;
                  recovering = obd->obd_recovering;
                  spin_unlock_bh(&obd->obd_processing_task_lock);
-                if (abort_recovery) {
-                        target_abort_recovery(obd);
-                } else if (recovering) {
+                if (recovering &&
+                    target_recovery_check_and_stop(obd) == 0) {
                          rc = mds_filter_recovery_request(req, obd,
                                                           &should_process);
                          if (rc || !should_process)
@@ -1646,9 +1665,9 @@ int mds_handle(struct ptlrpc_request *req)
                  __u32  opc;
                  int op = 0;
                  int size[4] = { sizeof(struct ptlrpc_body),
-                                sizeof(struct mds_body),
-                                mds->mds_max_mdsize,
-                                mds->mds_max_cookiesize };
+                               sizeof(struct mds_body),
+                               mds->mds_max_mdsize,
+                               mds->mds_max_cookiesize };
                  int bufcount;
  
                  /* NB only peek inside req now; mds_reint() will swab it */
@@ -1741,7 +1760,7 @@ int mds_handle(struct ptlrpc_request *req)
                  DEBUG_REQ(D_INODE, req, "set_info");
                  rc = mds_set_info_rpc(req->rq_export, req);
                  break;
-
+#ifdef HAVE_QUOTA_SUPPORT
          case MDS_QUOTACHECK:
                  DEBUG_REQ(D_INODE, req, "quotacheck");
                  OBD_FAIL_RETURN(OBD_FAIL_MDS_QUOTACHECK_NET, 0);
@@ -1753,10 +1772,12 @@ int mds_handle(struct ptlrpc_request *req)
                  OBD_FAIL_RETURN(OBD_FAIL_MDS_QUOTACTL_NET, 0);
                  rc = mds_handle_quotactl(req);
                  break;
-
+#endif
          case OBD_PING:
                  DEBUG_REQ(D_INODE, req, "ping");
                  rc = target_handle_ping(req);
+                if (req->rq_export->exp_delayed)
+                        mds_update_client_epoch(req->rq_export);
                  break;
  
          case OBD_LOG_CANCEL:
@@ -1839,19 +1860,8 @@ int mds_handle(struct ptlrpc_request *req)
          }
  
          EXIT;
- out:
-
-        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
-                if (obd && obd->obd_recovering) {
-                        DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
-                        return target_queue_last_replay_reply(req, rc);
-                }
-                /* Lost a race with recovery; let the error path DTRT. */
-                rc = req->rq_status = -ENOTCONN;
-        }
-
-        target_send_reply(req, rc, fail);
-        return 0;
+out:
+        return target_handle_reply(req, rc, fail);
  }
  
  /* Update the server data on disk.  This stores the new mount_count and
@@ -1983,10 +1993,12 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
  
          sema_init(&mds->mds_epoch_sem, 1);
          spin_lock_init(&mds->mds_transno_lock);
-        mds->mds_max_mdsize = sizeof(struct lov_mds_md);
+        mds->mds_max_mdsize = sizeof(struct lov_mds_md_v3);
          mds->mds_max_cookiesize = sizeof(struct llog_cookie);
          mds->mds_atime_diff = MAX_ATIME_DIFF;
          mds->mds_evict_ost_nids = 1;
+        /* sync permission changes */
+        mds->mds_sync_permission = 0;
  
          sprintf(ns_name, "mds-%s", obd->obd_uuid.uuid);
          obd->obd_namespace = ldlm_namespace_new(obd, ns_name, LDLM_NAMESPACE_SERVER,
@@ -2002,6 +2014,9 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
              lprocfs_alloc_obd_stats(obd, LPROC_MDS_LAST) == 0) {
                  /* Init private stats here */
                  mds_stats_counter_init(obd->obd_stats);
+#ifdef HAVE_DELAYED_RECOVERY
+                lprocfs_obd_attach_stale_exports(obd);
+#endif
                  obd->obd_proc_exports_entry = proc_mkdir("exports",
                                                           obd->obd_proc_entry);
          }
@@ -2016,7 +2031,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
          if (obd->obd_proc_exports_entry)
                  lprocfs_add_simple(obd->obd_proc_exports_entry,
                                     "clear", lprocfs_nid_stats_clear_read,
-                                   lprocfs_nid_stats_clear_write, obd);
+                                   lprocfs_nid_stats_clear_write, obd, NULL);
  
          if (lcfg->lcfg_bufcount >= 4 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
                  class_uuid_t uuid;
@@ -2077,8 +2092,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                                label ?: "", label ? "/" : "", str,
                                obd->obd_recovery_timeout / 60,
                                obd->obd_recovery_timeout % 60,
-                              obd->obd_max_recoverable_clients,
-                              (obd->obd_max_recoverable_clients == 1) ? "":"s",
+                              obd->obd_recoverable_clients,
+                              (obd->obd_recoverable_clients == 1) ? "":"s",
                                obd->obd_name);
          } else {
                  LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
@@ -2087,8 +2102,11 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                                obd->obd_replayable ? "enabled" : "disabled");
          }
  
+        /* Reduce the initial timeout on an MDS because it doesn't need such
+         * a long timeout as an OST does. Adaptive timeouts will adjust this
+         * value appropriately. */
          if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
-                ldlm_timeout = 6;
+                ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
  
          RETURN(0);
  
@@ -2147,6 +2165,7 @@ static int mds_lov_clean(struct obd_device *obd)
  static int mds_postsetup(struct obd_device *obd)
  {
          struct mds_obd *mds = &obd->u.mds;
+        struct llog_ctxt *ctxt;
          int rc = 0;
          ENTRY;
  
@@ -2158,14 +2177,14 @@ static int mds_postsetup(struct obd_device *obd)
          rc = llog_setup(obd, LLOG_LOVEA_ORIG_CTXT, obd, 0, NULL,
                          &llog_lvfs_ops);
          if (rc)
-                RETURN(rc);
+                GOTO(err_llog, rc);
  
          if (mds->mds_profile) {
                  struct lustre_profile *lprof;
-                /* The profile defines which osc and mdc to connect to, for a 
+                /* The profile defines which osc and mdc to connect to, for a
                     client.  We reuse that here to figure out the name of the
                     lov to use (and ignore lprof->lp_mdc).
-                   The profile was set in the config log with 
+                   The profile was set in the config log with
                     LCFG_MOUNTOPT profilenm oscnm mdcnm */
                  lprof = class_get_profile(mds->mds_profile);
                  if (lprof == NULL) {
@@ -2181,14 +2200,18 @@ static int mds_postsetup(struct obd_device *obd)
  
  err_cleanup:
          mds_lov_clean(obd);
-        llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
-        llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
-        RETURN(rc);
+        ctxt = llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
+err_llog:
+        ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
+        return rc;
  }
  
  int mds_postrecov(struct obd_device *obd)
  {
-        struct llog_ctxt *ctxt;
          int rc;
          ENTRY;
  
@@ -2196,20 +2219,19 @@ int mds_postrecov(struct obd_device *obd)
                  RETURN(0);
  
          LASSERT(!obd->obd_recovering);
-        ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); 
-        LASSERT(ctxt != NULL);
-        llog_ctxt_put(ctxt);
+
+        /* VBR: update boot epoch after recovery */
+        mds_update_last_epoch(obd);
  
          /* clean PENDING dir */
          rc = mds_cleanup_pending(obd);
          if (rc < 0)
                  GOTO(out, rc);
-
          /* FIXME Does target_finish_recovery really need this to block? */
          /* Notify the LOV, which will in turn call mds_notify for each tgt */
          /* This means that we have to hack obd_notify to think we're obd_set_up
             during mds_lov_connect. */
-        obd_notify(obd->u.mds.mds_osc_obd, NULL, 
+        obd_notify(obd->u.mds.mds_osc_obd, NULL,
                     obd->obd_async_recov ? OBD_NOTIFY_SYNC_NONBLOCK :
                     OBD_NOTIFY_SYNC, NULL);
  
@@ -2315,38 +2337,35 @@ static void fixup_handle_for_resent_req(struct ptlrpc_request *req, int offset,
          struct ldlm_request *dlmreq =
                  lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*dlmreq));
          struct lustre_handle remote_hdl = dlmreq->lock_handle[0];
-        struct list_head *iter;
+        struct ldlm_lock *lock;
  
          if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
                  return;
  
-        spin_lock(&exp->exp_ldlm_data.led_lock);
-        list_for_each(iter, &exp->exp_ldlm_data.led_held_locks) {
-                struct ldlm_lock *lock;
-                lock = list_entry(iter, struct ldlm_lock, l_export_chain);
-                if (lock == new_lock)
-                        continue;
-                if (lock->l_remote_handle.cookie == remote_hdl.cookie) {
+        lock = lustre_hash_lookup(exp->exp_lock_hash, &remote_hdl);
+        if (lock) {
+                if (lock != new_lock) {
                          lockh->cookie = lock->l_handle.h_cookie;
                          LDLM_DEBUG(lock, "restoring lock cookie");
-                        DEBUG_REQ(D_DLMTRACE, req,"restoring lock cookie "LPX64,
-                                  lockh->cookie);
+                        DEBUG_REQ(D_DLMTRACE, req, "restoring lock cookie "
+                                  LPX64, lockh->cookie);
                          if (old_lock)
                                  *old_lock = LDLM_LOCK_GET(lock);
-                        spin_unlock(&exp->exp_ldlm_data.led_lock);
+
+                        lh_put(exp->exp_lock_hash, &lock->l_exp_hash);
                          return;
                  }
+                lh_put(exp->exp_lock_hash, &lock->l_exp_hash);
          }
-        spin_unlock(&exp->exp_ldlm_data.led_lock);
  
          /* If the xid matches, then we know this is a resent request,
           * and allow it. (It's probably an OPEN, for which we don't
           * send a lock */
-        if (req->rq_xid ==
+        if (req->rq_xid <=
              le64_to_cpu(exp->exp_mds_data.med_lcd->lcd_last_xid))
                  return;
  
-        if (req->rq_xid ==
+        if (req->rq_xid <=
              le64_to_cpu(exp->exp_mds_data.med_lcd->lcd_last_close_xid))
                  return;
  
@@ -2422,6 +2441,13 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
          else if (it->opc & IT_UNLINK)
                  repsize[repbufcnt++] = mds->mds_max_cookiesize;
  
+        /* if we do recovery we isn't send reply mds state is restored */
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+                repsize[DLM_REPLY_REC_OFF+1] = 0;
+                if (it->opc & IT_UNLINK)
+                        repsize[DLM_REPLY_REC_OFF+2] = 0;
+        }
+
          rc = lustre_pack_reply(req, repbufcnt, repsize, NULL);
          if (rc)
                  RETURN(req->rq_status = rc);
@@ -2429,7 +2455,6 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
          rep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF, sizeof(*rep));
          intent_set_disposition(rep, DISP_IT_EXECD);
  
-
          /* execute policy */
          switch ((long)it->opc) {
          case IT_OPEN:
@@ -2456,10 +2481,12 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
                       !intent_disposition(rep, DISP_OPEN_LOCK)) {
                          /* If it is the disconnect error (ENODEV & ENOCONN)
                           * ptlrpc layer should know this imediately, it should
-                         * be replied by rq_stats, otherwise, return it by 
+                         * be replied by rq_stats, otherwise, return it by
                           * intent here
                           */
-                        if (IS_CLIENT_DISCONNECT_ERROR(rep->lock_policy_res2))
+                         /* if VBR failure then return error in rq_stats too */
+                        if (IS_CLIENT_DISCONNECT_ERROR(rep->lock_policy_res2) ||
+                            rep->lock_policy_res2 == -EOVERFLOW)
                                  RETURN(rep->lock_policy_res2);
                          else
                                  RETURN(ELDLM_LOCK_ABORTED);
@@ -2546,20 +2573,18 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
          new_lock->l_writers = 0;
  
          new_lock->l_export = class_export_get(req->rq_export);
-        spin_lock(&req->rq_export->exp_ldlm_data.led_lock);
-        list_add(&new_lock->l_export_chain,
-                 &new_lock->l_export->exp_ldlm_data.led_held_locks);
-        spin_unlock(&req->rq_export->exp_ldlm_data.led_lock);
-
          new_lock->l_blocking_ast = lock->l_blocking_ast;
          new_lock->l_completion_ast = lock->l_completion_ast;
+        new_lock->l_flags &= ~LDLM_FL_LOCAL;
  
          memcpy(&new_lock->l_remote_handle, &lock->l_remote_handle,
                 sizeof(lock->l_remote_handle));
  
-        new_lock->l_flags &= ~LDLM_FL_LOCAL;
-
          unlock_res_and_lock(new_lock);
+
+        lustre_hash_add(new_lock->l_export->exp_lock_hash,
+                        &new_lock->l_remote_handle, 
+                        &new_lock->l_exp_hash);
          LDLM_LOCK_PUT(new_lock);
  
          RETURN(ELDLM_LOCK_REPLACED);
@@ -2588,12 +2613,12 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
                  mds_max_threads = mds_min_threads = mds_num_threads;
          } else {
                  /* Base min threads on memory and cpus */
-                mds_min_threads = num_possible_cpus() * num_physpages >> 
+                mds_min_threads = num_possible_cpus() * num_physpages >>
                          (27 - CFS_PAGE_SHIFT);
                  if (mds_min_threads < MDS_THREADS_MIN)
                          mds_min_threads = MDS_THREADS_MIN;
                  /* Largest auto threads start value */
-                if (mds_min_threads > 32) 
+                if (mds_min_threads > 32)
                          mds_min_threads = 32;
                  mds_max_threads = min(MDS_THREADS_MAX, mds_min_threads * 4);
          }
@@ -2604,7 +2629,8 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
                                  MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR,
                                  mds_handle, LUSTRE_MDS_NAME,
                                  obd->obd_proc_entry, target_print_req,
-                                mds_min_threads, mds_max_threads, "ll_mdt");
+                                mds_min_threads, mds_max_threads, "ll_mdt",
+                                NULL);
  
          if (!mds->mds_service) {
                  CERROR("failed to start service\n");
@@ -2622,7 +2648,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
                                  mds_handle, "mds_setattr",
                                  obd->obd_proc_entry, target_print_req,
                                  mds_min_threads, mds_max_threads,
-                                "ll_mdt_attr");
+                                "ll_mdt_attr", NULL);
          if (!mds->mds_setattr_service) {
                  CERROR("failed to start getattr service\n");
                  GOTO(err_thread, rc = -ENOMEM);
@@ -2639,7 +2665,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
                                  mds_handle, "mds_readpage",
                                  obd->obd_proc_entry, target_print_req,
                                  MDS_THREADS_MIN_READPAGE, mds_max_threads,
-                                "ll_mdt_rdpg");
+                                "ll_mdt_rdpg", NULL);
          if (!mds->mds_readpage_service) {
                  CERROR("failed to start readpage service\n");
                  GOTO(err_thread2, rc = -ENOMEM);
@@ -2742,13 +2768,20 @@ static int mds_health_check(struct obd_device *obd)
  static int mds_process_config(struct obd_device *obd, obd_count len, void *buf)
  {
          struct lustre_cfg *lcfg = buf;
-        struct lprocfs_static_vars lvars;
-        int rc;
+        int rc = 0;
+
+        switch(lcfg->lcfg_command) {
+        case LCFG_PARAM: {
+                struct lprocfs_static_vars lvars;
+                lprocfs_mds_init_vars(&lvars);
+
+                rc = class_process_proc_param(PARAM_MDT, lvars.obd_vars, lcfg, obd);
+                break;
+        }
+        default:
+                break;
+        }
  
-        lprocfs_mds_init_vars(&lvars);
-        
-        rc = class_process_proc_param(PARAM_MDT, lvars.obd_vars, lcfg, obd);
-        
          return(rc);
  }
  
@@ -2803,7 +2836,7 @@ static int __init mds_init(void)
                  return rc;
          }
          init_obd_quota_ops(mds_quota_interface_ref, &mds_obd_ops);
-        
+
          lprocfs_mds_init_vars(&lvars);
          class_register_type(&mds_obd_ops, lvars.module_vars, LUSTRE_MDS_NAME);
          lprocfs_mdt_init_vars(&lvars);
@@ -2822,7 +2855,7 @@ static void /*__exit*/ mds_exit(void)
          class_unregister_type(LUSTRE_MDT_NAME);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Metadata Server (MDS)");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c

index c2aacd7..834b67f 100644 (file)
--- a/lustre/mds/lproc_mds.c
+++ b/lustre/mds/lproc_mds.c
@@ -1,33 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <asm/statfs.h>
-#endif
  #include <obd.h>
  #include <obd_class.h>
  #include <lprocfs_status.h>
@@ -394,154 +403,30 @@ static int lprocfs_rd_nosquash_nid(char *page, char **start, off_t off,
                          libcfs_nid2str(mds->mds_nosquash_nid));
  }
  
-#ifdef HAVE_QUOTA_SUPPORT
-static int lprocfs_mds_rd_switch_qs(char *page, char **start, off_t off,
+static int lprocfs_mds_rd_sync_perm(char *page, char **start, off_t off,
                                      int count, int *eof, void *data)
  {
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-        return snprintf(page, count, "changing qunit size is %s\n",
-                        obd->u.obt.obt_qctxt.lqc_switch_qs ?
-                        "enabled" : "disabled");
-}
-
-static int lprocfs_mds_rd_boundary_factor(char *page, char **start, off_t off,
-                                          int count, int *eof, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_cqs_boundary_factor);
-}
-
-static int lprocfs_mds_rd_least_bunit(char *page, char **start, off_t off,
-                                      int count, int *eof, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_cqs_least_bunit);
-}
-
-static int lprocfs_mds_rd_least_iunit(char *page, char **start, off_t off,
-                                      int count, int *eof, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_cqs_least_iunit);
-}
-
-static int lprocfs_mds_rd_qs_factor(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_cqs_qs_factor);
-}
-
-static int lprocfs_mds_wr_switch_qs(struct file *file, const char *buffer,
-                                    unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        int val, rc;
-        LASSERT(obd != NULL);
-
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val)
-            obd->u.obt.obt_qctxt.lqc_switch_qs = 1;
-        else
-            obd->u.obt.obt_qctxt.lqc_switch_qs = 0;
-
-        return count;
-}
-
-static int lprocfs_mds_wr_boundary_factor(struct file *file, const char *buffer,
-                                          unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        int val, rc;
-        LASSERT(obd != NULL);
-
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val < 2)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_cqs_boundary_factor = val;
-        return count;
-}
-
-static int lprocfs_mds_wr_least_bunit(struct file *file, const char *buffer,
-                                  unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        int val, rc;
-        LASSERT(obd != NULL);
-
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val < PTLRPC_MAX_BRW_SIZE ||
-            val >= obd->u.obt.obt_qctxt.lqc_bunit_sz)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_cqs_least_bunit = val;
-        return count;
-}
+        struct obd_device* obd = (struct obd_device *)data;
  
-static int lprocfs_mds_wr_least_iunit(struct file *file, const char *buffer,
-                                  unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        int val, rc;
          LASSERT(obd != NULL);
  
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val < 1 || val >= obd->u.obt.obt_qctxt.lqc_iunit_sz)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_cqs_least_iunit = val;
-        return count;
+        return snprintf(page, count, "%d\n", obd->u.mds.mds_sync_permission);
  }
  
-static int lprocfs_mds_wr_qs_factor(struct file *file, const char *buffer,
+static int lprocfs_mds_wr_sync_perm(struct file *file, const char *buffer,
                                      unsigned long count, void *data)
  {
-        struct obd_device *obd = (struct obd_device *)data;
+        struct obd_device *obd = data;
          int val, rc;
-        LASSERT(obd != NULL);
  
          rc = lprocfs_write_helper(buffer, count, &val);
          if (rc)
                  return rc;
  
-        if (val < 2)
-                return -EINVAL;
+        obd->u.mds.mds_sync_permission = !!val;
  
-        obd->u.obt.obt_qctxt.lqc_cqs_qs_factor = val;
          return count;
  }
-#endif
  
  struct lprocfs_vars lprocfs_mds_obd_vars[] = {
          { "uuid",            lprocfs_rd_uuid,        0, 0 },
@@ -554,6 +439,7 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = {
          { "fstype",          lprocfs_rd_fstype,      0, 0 },
          { "mntdev",          lprocfs_mds_rd_mntdev,  0, 0 },
          { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 },
+        { "hash_stats",      lprocfs_obd_rd_hash,    0, 0 },
          { "evict_client",    0,                lprocfs_mds_wr_evict_client, 0 },
          { "evict_ost_nids",  lprocfs_mds_rd_evictostnids,
                                                 lprocfs_mds_wr_evictostnids, 0 },
@@ -564,16 +450,16 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = {
          { "quota_iunit_sz",  lprocfs_quota_rd_iunit, lprocfs_quota_wr_iunit, 0 },
          { "quota_itune_sz",  lprocfs_quota_rd_itune, lprocfs_quota_wr_itune, 0 },
          { "quota_type",      lprocfs_quota_rd_type,  lprocfs_quota_wr_type, 0 },
-        { "quota_switch_qs", lprocfs_mds_rd_switch_qs,
-                             lprocfs_mds_wr_switch_qs, 0 },
-        { "quota_boundary_factor", lprocfs_mds_rd_boundary_factor,
-                                   lprocfs_mds_wr_boundary_factor, 0 },
-        { "quota_least_bunit", lprocfs_mds_rd_least_bunit,
-                               lprocfs_mds_wr_least_bunit, 0 },
-        { "quota_least_iunit", lprocfs_mds_rd_least_iunit,
-                               lprocfs_mds_wr_least_iunit, 0 },
-        { "quota_qs_factor",   lprocfs_mds_rd_qs_factor,
-                               lprocfs_mds_wr_qs_factor, 0 },
+        { "quota_switch_qs", lprocfs_quota_rd_switch_qs,
+                             lprocfs_quota_wr_switch_qs, 0 },
+        { "quota_boundary_factor", lprocfs_quota_rd_boundary_factor,
+                                   lprocfs_quota_wr_boundary_factor, 0 },
+        { "quota_least_bunit", lprocfs_quota_rd_least_bunit,
+                               lprocfs_quota_wr_least_bunit, 0 },
+        { "quota_least_iunit", lprocfs_quota_rd_least_iunit,
+                               lprocfs_quota_wr_least_iunit, 0 },
+        { "quota_qs_factor",   lprocfs_quota_rd_qs_factor,
+                               lprocfs_quota_wr_qs_factor, 0 },
          { "quota_switch_seconds",  lprocfs_quota_rd_switch_seconds,
                                     lprocfs_quota_wr_switch_seconds, 0 },
  #endif
@@ -590,6 +476,13 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = {
                               lprocfs_wr_rootsquash, 0 },
          { "nosquash_nid",    lprocfs_rd_nosquash_nid,
                               lprocfs_wr_nosquash_nid, 0 },
+        { "sync_permission", lprocfs_mds_rd_sync_perm,
+                             lprocfs_mds_wr_sync_perm, 0 },
+#ifdef HAVE_DELAYED_RECOVERY
+        { "stale_export_age", lprocfs_obd_rd_stale_export_age,
+                              lprocfs_obd_wr_stale_export_age, 0},
+        { "flush_stale_exports", 0, lprocfs_obd_wr_flush_stale_exports, 0 },
+#endif
          { 0 }
  };
  
diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c

index a747d0b..81b4399 100644 (file)
--- a/lustre/mds/mds_fs.c
+++ b/lustre/mds/mds_fs.c
@@ -1,29 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  mds/mds_fs.c
- *  Lustre Metadata Server (MDS) filesystem interface code
+ * GPL HEADER START
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_fs.c
+ *
+ * Lustre Metadata Server (MDS) filesystem interface code
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -36,9 +50,7 @@
  #include <linux/version.h>
  #include <linux/sched.h>
  #include <lustre_quota.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <linux/mount.h>
-#endif
  #include <lustre_mds.h>
  #include <obd_class.h>
  #include <obd_support.h>
@@ -50,15 +62,21 @@
  #include "mds_internal.h"
  
  
-static int mds_export_stats_init(struct obd_device *obd,
+int mds_export_stats_init(struct obd_device *obd,
                                   struct obd_export *exp,
-                                 void *client_nid)
-  {
+                                 void *localdata)
+{
+        lnet_nid_t *client_nid = localdata;
          int rc, num_stats, newnid = 0;
  
          rc = lprocfs_exp_setup(exp, client_nid, &newnid);
-        if (rc)
+        if (rc) {
+                /* Mask error for already created
+                 * /proc entries */
+                if (rc == -EALREADY)
+                        rc = 0;
                  return rc;
+        }
  
          if (newnid) {
                  struct nid_stat *tmp = exp->exp_nid_stats;
@@ -78,11 +96,77 @@ static int mds_export_stats_init(struct obd_device *obd,
                          return rc;
  
                  mds_stats_counter_init(tmp->nid_stats);
+
+                /* Always add in ldlm_stats */
+                tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC -
+                                                          LDLM_FIRST_OPC,
+                                                          0);
+                if (tmp->nid_ldlm_stats == NULL)
+                        return -ENOMEM;
+
+                lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+                rc = lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+                                            tmp->nid_ldlm_stats);
+                if (rc)
+                        return rc;
          }
  
          return 0;
  }
  
+/* VBR: to determine the delayed client the lcd should be updated for each new
+ * epoch */
+int mds_update_client_epoch(struct obd_export *exp)
+{
+        struct mds_export_data *med = &exp->exp_mds_data;
+        struct mds_obd *mds = &exp->exp_obd->u.mds;
+        struct lvfs_run_ctxt saved;
+        loff_t off = med->med_lr_off;
+        int rc = 0;
+
+        /* VBR: set client last_epoch to current epoch */
+        if (le32_to_cpu(med->med_lcd->lcd_last_epoch) >=
+                        le32_to_cpu(mds->mds_server_data->lsd_start_epoch))
+                return rc;
+
+        med->med_lcd->lcd_last_epoch = mds->mds_server_data->lsd_start_epoch;
+        push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+        rc = fsfilt_write_record(exp->exp_obd, mds->mds_rcvd_filp,
+                                 med->med_lcd, sizeof(*med->med_lcd), &off,
+                                 exp->exp_delayed);
+        pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+
+        CDEBUG(D_INFO, "update client idx %u last_epoch %#x (%#x)\n",
+               med->med_lr_idx, le32_to_cpu(med->med_lcd->lcd_last_epoch),
+               le32_to_cpu(mds->mds_server_data->lsd_start_epoch));
+
+        return rc;
+}
+
+/* Called after recovery is done on server */
+void mds_update_last_epoch(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        struct mds_obd *mds = &obd->u.mds;
+        __u32 start_epoch;
+
+        /* Increase server epoch after recovery */
+        spin_lock(&mds->mds_transno_lock);
+        start_epoch = lr_epoch(mds->mds_last_transno) + 1;
+        mds->mds_last_transno = (__u64)start_epoch << LR_EPOCH_BITS;
+        mds->mds_server_data->lsd_start_epoch = cpu_to_le32(start_epoch);
+        spin_unlock(&mds->mds_transno_lock);
+
+        /* go through delayed reply queue to find all exports participate in
+         * recovery and set new epoch for them */
+        list_for_each_entry(req, &obd->obd_delayed_reply_queue, rq_list) {
+                LASSERT(!req->rq_export->exp_delayed);
+                mds_update_client_epoch(req->rq_export);
+        }
+        mds_update_server_data(obd, 1);
+}
+
  /* Add client data to the MDS.  We use a bitmap to locate a free space
   * in the last_rcvd file if cl_off is -1 (i.e. a new client).
   * Otherwise, we have just read the data from the last_rcvd file and
@@ -98,7 +182,7 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp,
          struct mds_export_data *med = &exp->exp_mds_data;
          unsigned long *bitmap = mds->mds_client_bitmap;
          int new_client = (cl_idx == -1);
-        int rc;
+        int rc = 0;
          ENTRY;
  
          LASSERT(bitmap != NULL);
@@ -108,6 +192,10 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp,
          if (!strcmp(med->med_lcd->lcd_uuid, obd->obd_uuid.uuid))
                  RETURN(0);
  
+        /* VBR: remove expired exports before searching for free slot */
+        if (new_client)
+                class_disconnect_expired_exports(obd);
+
          /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so
           * there's no need for extra complication here
           */
@@ -143,18 +231,32 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp,
          mds_export_stats_init(obd, exp, localdata);
  
          if (new_client) {
-                struct lvfs_run_ctxt saved;
+                struct lvfs_run_ctxt *saved = NULL;
                  loff_t off = med->med_lr_off;
                  struct file *file = mds->mds_rcvd_filp;
                  void *handle;
  
-                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                OBD_SLAB_ALLOC_PTR(saved, obd_lvfs_ctxt_cache);
+                if (saved == NULL) {
+                        CERROR("cannot allocate memory for run ctxt\n");
+                        RETURN(-ENOMEM);
+                }
+
+                push_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
                  handle = fsfilt_start(obd, file->f_dentry->d_inode,
                                        FSFILT_OP_SETATTR, NULL);
                  if (IS_ERR(handle)) {
                          rc = PTR_ERR(handle);
                          CERROR("unable to start transaction: rc %d\n", rc);
                  } else {
+                        /* VBR: set client last_transno as mds_last_transno to
+                         * remember last epoch for this client */
+                        med->med_lcd->lcd_last_epoch =
+                                        mds->mds_server_data->lsd_start_epoch;
+                        exp->exp_last_request_time = cfs_time_current_sec();
+                        /* remember first epoch of client for orphan handling */
+                        med->med_lcd->lcd_first_epoch =
+                                  cpu_to_le32(lr_epoch(mds->mds_last_transno));
                          rc = fsfilt_add_journal_cb(obd, 0, handle,
                                                     target_client_add_cb, exp);
                          if (rc == 0) {
@@ -168,7 +270,8 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp,
                          fsfilt_commit(obd, file->f_dentry->d_inode, handle, 0);
                  }
  
-                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                pop_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
+                OBD_SLAB_FREE_PTR(saved, obd_lvfs_ctxt_cache);
  
                  if (rc)
                          return rc;
@@ -176,16 +279,17 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp,
                         med->med_lr_idx, med->med_lr_off,
                         (unsigned int)sizeof(*med->med_lcd));
          }
-        return 0;
+        return rc;
  }
  
+struct lsd_client_data zero_lcd; /* globals are implicitly zeroed */
+ 
  int mds_client_free(struct obd_export *exp)
  {
          struct mds_export_data *med = &exp->exp_mds_data;
          struct mds_obd *mds = &exp->exp_obd->u.mds;
          struct obd_device *obd = exp->exp_obd;
-        struct lsd_client_data zero_lcd;
-        struct lvfs_run_ctxt saved;
+        struct lvfs_run_ctxt *saved = NULL;
          int rc;
          loff_t off;
          ENTRY;
@@ -202,7 +306,6 @@ int mds_client_free(struct obd_export *exp)
  
          LASSERT(mds->mds_client_bitmap != NULL);
  
-        lprocfs_exp_cleanup(exp);
  
          off = med->med_lr_off;
  
@@ -223,17 +326,31 @@ int mds_client_free(struct obd_export *exp)
          }
  
          if (!(exp->exp_flags & OBD_OPT_FAILOVER)) {
-                memset(&zero_lcd, 0, sizeof(zero_lcd));
-                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                /* Don't force sync on each disconnect if aborting recovery,
+                 * or it does num_clients * num_osts syncs.  b=17194 */
+                int need_sync = (!exp->exp_libclient || exp->exp_need_sync) &&
+                                 !(exp->exp_flags & OBD_OPT_ABORT_RECOV);
+                OBD_SLAB_ALLOC_PTR(saved, obd_lvfs_ctxt_cache);
+                if (saved == NULL) {
+                        CERROR("cannot allocate memory for run ctxt\n");
+                        GOTO(free, rc = -ENOMEM);
+                }
+                push_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
                  rc = fsfilt_write_record(obd, mds->mds_rcvd_filp, &zero_lcd,
-                                         sizeof(zero_lcd), &off,
-                                         (!exp->exp_libclient ||
-                                          exp->exp_need_sync));
-                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                                         sizeof(zero_lcd), &off, 0);
+
+                /* Make sure the server's last_transno is up to date. Do this
+                 * after the client is freed so we know all the client's
+                 * transactions have been committed. */
+                if (rc == 0)
+                        mds_update_server_data(exp->exp_obd, need_sync);
+
+                pop_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
  
                  CDEBUG(rc == 0 ? D_INFO : D_ERROR,
-                       "zeroing out client %s idx %u in %s rc %d\n",
-                       med->med_lcd->lcd_uuid, med->med_lr_idx, LAST_RCVD, rc);
+                       "zero out client %s at idx %u/%llu in %s %ssync rc %d\n",
+                       med->med_lcd->lcd_uuid, med->med_lr_idx, med->med_lr_off,
+                       LAST_RCVD, need_sync ? "" : "a", rc);
          }
  
          if (!test_and_clear_bit(med->med_lr_idx, mds->mds_client_bitmap)) {
@@ -242,14 +359,11 @@ int mds_client_free(struct obd_export *exp)
                  LBUG();
          }
  
-
-        /* Make sure the server's last_transno is up to date. Do this
-         * after the client is freed so we know all the client's
-         * transactions have been committed. */
-        mds_update_server_data(exp->exp_obd, 0);
-
          EXIT;
- free:
+free:
+        if (saved)
+                OBD_SLAB_FREE_PTR(saved, obd_lvfs_ctxt_cache);
+
          OBD_FREE_PTR(med->med_lcd);
          med->med_lcd = NULL;
  
@@ -265,6 +379,85 @@ static int mds_server_free_data(struct mds_obd *mds)
          return 0;
  }
  
+static void mds_add_fake_export(struct obd_device *obd, int num,
+                                struct file *file)
+{
+        struct obd_export *exp;
+        struct lvfs_run_ctxt saved;
+        struct obd_device_target *obt = &obd->u.obt;
+        struct lu_export_data *led;
+        unsigned long *bitmap = obt->obt_client_bitmap;
+        struct lsd_client_data *lcd = NULL;
+        unsigned int idx = 0;
+        loff_t off = 0;
+        int rc = 0;
+
+        while (num > 0) {
+                num--;
+                if (!lcd) {
+                        OBD_ALLOC_PTR(lcd);
+                        if (!lcd)
+                                return;
+                }
+                idx = find_next_zero_bit(bitmap, LR_MAX_CLIENTS, idx);
+                if (idx >= LR_MAX_CLIENTS) {
+                        CERROR("no room for %u clients - fix LR_MAX_CLIENTS\n", idx);
+                        OBD_FREE_PTR(lcd);
+                        break;
+                }
+                if (test_and_set_bit(idx, bitmap)) {
+                        CERROR("Bit %u is set already\n", idx);
+                        continue;
+                }
+                off = le32_to_cpu(obt->obt_lsd->lsd_client_start) +
+                      idx * le16_to_cpu(obt->obt_lsd->lsd_client_size);
+
+                sprintf(lcd->lcd_uuid, "dead-%.16u", idx);
+                CDEBUG(D_INFO, "Create fake export %s, index %u, offset %lu\n",
+                       lcd->lcd_uuid, idx, (unsigned long)off);
+
+                exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid);
+                if (IS_ERR(exp)) {
+                        if (PTR_ERR(exp) == -EALREADY) {
+                                CERROR("Export %s already exists\n",
+                                       lcd->lcd_uuid);
+                        }
+                        CERROR("Failed to create export %lu\n", PTR_ERR(exp));
+                        OBD_FREE_PTR(lcd);
+                        break;
+                }
+                LASSERT(exp);
+                led = &exp->exp_target_data;
+                led->led_lr_idx = idx;
+                led->led_lr_off = off;
+                led->led_lcd = lcd;
+
+                exp->exp_last_request_time = cfs_time_current_sec();
+                exp->exp_replay_needed = 1;
+                exp->exp_connecting = 0;
+                exp->exp_in_recovery = 0;
+
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                obd->obd_recoverable_clients++;
+                obd->obd_max_recoverable_clients++;
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+
+                class_set_export_delayed(exp);
+                class_export_put(exp);
+
+                lcd->lcd_last_epoch = cpu_to_le32(1);
+                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                rc = fsfilt_write_record(obd, file, lcd, sizeof(*lcd), &off, 0);
+                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                if (rc) {
+                        CERROR("Failed to create fake client record\n");
+                        OBD_FREE_PTR(lcd);
+                        break;
+                }
+                lcd = NULL;
+        }
+}
+
  static int mds_init_server_data(struct obd_device *obd, struct file *file)
  {
          struct mds_obd *mds = &obd->u.mds;
@@ -273,6 +466,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
          loff_t off = 0;
          unsigned long last_rcvd_size = i_size_read(file->f_dentry->d_inode);
          __u64 mount_count;
+        __u32 start_epoch;
          int cl_idx, rc = 0;
          ENTRY;
  
@@ -343,9 +537,13 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
  
          lsd->lsd_feature_compat = cpu_to_le32(OBD_COMPAT_MDT);
  
+        target_trans_table_init(obd);
          mds->mds_last_transno = le64_to_cpu(lsd->lsd_last_transno);
+        start_epoch = le32_to_cpu(lsd->lsd_start_epoch);
  
-        CDEBUG(D_INODE, "%s: server last_transno: "LPU64"\n",
+        CDEBUG(D_INODE, "%s: server start_epoch: %#x\n",
+               obd->obd_name, start_epoch);
+        CDEBUG(D_INODE, "%s: server last_transno: "LPX64"\n",
                 obd->obd_name, mds->mds_last_transno);
          CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
                 obd->obd_name, mount_count + 1);
@@ -374,6 +572,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
          for (cl_idx = 0, off = le32_to_cpu(lsd->lsd_client_start);
               off < last_rcvd_size; cl_idx++) {
                  __u64 last_transno;
+                __u32 last_epoch;
                  struct obd_export *exp;
                  struct mds_export_data *med;
  
@@ -401,10 +600,8 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                          continue;
                  }
  
-                last_transno = le64_to_cpu(lcd->lcd_last_transno) >
-                               le64_to_cpu(lcd->lcd_last_close_transno) ?
-                               le64_to_cpu(lcd->lcd_last_transno) :
-                               le64_to_cpu(lcd->lcd_last_close_transno);
+                last_transno = lsd_last_transno(lcd);
+                last_epoch = le32_to_cpu(lcd->lcd_last_epoch);
  
                  /* These exports are cleaned up by mds_disconnect(), so they
                   * need to be set up like real exports as mds_connect() does.
@@ -429,26 +626,42 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                          /* can't fail for existing client */
                          LASSERTF(rc == 0, "rc = %d\n", rc);
  
+                        /* VBR: set export last committed version */
+                        exp->exp_last_committed = last_transno;
+                        /* read last time from disk */
+                        exp->exp_last_request_time = target_trans_table_last_time(exp);
                          lcd = NULL;
  
                          spin_lock(&exp->exp_lock);
                          exp->exp_replay_needed = 1;
                          exp->exp_connecting = 0;
+                        exp->exp_in_recovery = 0;
                          spin_unlock(&exp->exp_lock);
  
+                        spin_lock_bh(&obd->obd_processing_task_lock);
                          obd->obd_recoverable_clients++;
                          obd->obd_max_recoverable_clients++;
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+                        /* VBR: if epoch too old mark export as delayed,
+                         * if epoch is zero then client is pre-vbr one */
+                        if (start_epoch > last_epoch && last_epoch != 0)
+                                class_set_export_delayed(exp);
                          class_export_put(exp);
                  }
  
                  /* Need to check last_rcvd even for duplicated exports. */
-                CDEBUG(D_OTHER, "client at idx %d has last_transno = "LPU64"\n",
-                       cl_idx, last_transno);
+                CDEBUG(D_OTHER, "client at idx %d has last_transno = "LPX64","
+                       "last_epoch %#x\n", cl_idx, last_transno, last_epoch);
  
                  if (last_transno > mds->mds_last_transno)
                          mds->mds_last_transno = last_transno;
          }
  
+        if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_FAKE_EXP))) {
+                mds_add_fake_export(obd, obd_fail_val, file);
+        }
+
          if (lcd)
                  OBD_FREE_PTR(lcd);
  
@@ -456,8 +669,9 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
  
          if (obd->obd_recoverable_clients) {
                  CWARN("RECOVERY: service %s, %d recoverable clients, "
-                      "last_transno "LPU64"\n", obd->obd_name,
-                      obd->obd_recoverable_clients, mds->mds_last_transno);
+                      "%d delayed clients, last_transno "LPU64"\n",
+                      obd->obd_name, obd->obd_recoverable_clients,
+                      obd->obd_delayed_clients, mds->mds_last_transno);
                  obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
                  obd->obd_recovering = 1;
                  obd->obd_recovery_start = 0;
@@ -467,8 +681,11 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                  /* bz13079: this won't be changed for mds */
                  obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
  #endif
+        } else {
+                LASSERT(!obd->obd_recovering);
+                /* VBR: update boot epoch after recovery */
+                mds_update_last_epoch(obd);
          }
-
          mds->mds_mount_count = mount_count + 1;
          lsd->lsd_mount_count = lsd->lsd_compat14 =
                  cpu_to_le64(mds->mds_mount_count);
@@ -490,7 +707,7 @@ err_msd:
  int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
  {
          struct mds_obd *mds = &obd->u.mds;
-        struct lvfs_run_ctxt saved;
+        struct lvfs_run_ctxt *saved = NULL;
          struct dentry *dentry;
          struct file *file;
          int rc;
@@ -502,9 +719,17 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
          if (rc)
                  RETURN(rc);
  
+        OBD_SLAB_ALLOC_PTR(saved, obd_lvfs_ctxt_cache);
+        if (saved == NULL) {
+                CERROR("cannot allocate memory for run ctxt\n");
+                RETURN(-ENOMEM);
+        }
+
          mds->mds_vfsmnt = mnt;
          /* why not mnt->mnt_sb instead of mnt->mnt_root->d_inode->i_sb? */
          obd->u.obt.obt_sb = mnt->mnt_root->d_inode->i_sb;
+        obd->u.obt.obt_stale_export_age = STALE_EXPORT_MAXTIME_DEFAULT;
+        spin_lock_init(&obd->u.obt.obt_trans_table_lock);
  
          rc = fsfilt_setup(obd, obd->u.obt.obt_sb);
          if (rc)
@@ -517,8 +742,8 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
          obd->obd_lvfs_ctxt.cb_ops = mds_lvfs_ops;
  
          /* setup the directory tree */
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        dentry = simple_mkdir(current->fs->pwd, "ROOT", 0755, 0);
+        push_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
+        dentry = simple_mkdir(current->fs->pwd, mnt, "ROOT", 0755, 0);
          if (IS_ERR(dentry)) {
                  rc = PTR_ERR(dentry);
                  CERROR("cannot create ROOT directory: rc = %d\n", rc);
@@ -546,7 +771,7 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
                  GOTO(err_fid, rc);
          }
  
-        dentry = simple_mkdir(current->fs->pwd, "PENDING", 0777, 1);
+        dentry = simple_mkdir(current->fs->pwd, mnt, "PENDING", 0777, 1);
          if (IS_ERR(dentry)) {
                  rc = PTR_ERR(dentry);
                  CERROR("cannot create PENDING directory: rc = %d\n", rc);
@@ -555,7 +780,7 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
          mds->mds_pending_dir = dentry;
  
          /* COMPAT_146 */
-        dentry = simple_mkdir(current->fs->pwd, MDT_LOGS_DIR, 0777, 1);
+        dentry = simple_mkdir(current->fs->pwd, mnt, MDT_LOGS_DIR, 0777, 1);
          if (IS_ERR(dentry)) {
                  rc = PTR_ERR(dentry);
                  CERROR("cannot create %s directory: rc = %d\n",
@@ -565,7 +790,7 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
          mds->mds_logs_dir = dentry;
          /* end COMPAT_146 */
  
-        dentry = simple_mkdir(current->fs->pwd, "OBJECTS", 0777, 1);
+        dentry = simple_mkdir(current->fs->pwd, mnt, "OBJECTS", 0777, 1);
          if (IS_ERR(dentry)) {
                  rc = PTR_ERR(dentry);
                  CERROR("cannot create OBJECTS directory: rc = %d\n", rc);
@@ -616,8 +841,8 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
          if (rc)
                  GOTO(err_health_check, rc);
  err_pop:
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
+        pop_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
+        OBD_SLAB_FREE_PTR(saved, obd_lvfs_ctxt_cache);
          return rc;
  
  err_health_check:
@@ -645,9 +870,15 @@ err_fid:
  int mds_fs_cleanup(struct obd_device *obd)
  {
          struct mds_obd *mds = &obd->u.mds;
-        struct lvfs_run_ctxt saved;
+        struct lvfs_run_ctxt *saved = NULL;
          int rc = 0;
  
+        OBD_SLAB_ALLOC_PTR(saved, obd_lvfs_ctxt_cache);
+        if (saved == NULL) {
+                CERROR("cannot allocate memory for run ctxt\n");
+                RETURN(-ENOMEM);
+        }
+
          if (obd->obd_fail)
                  LCONSOLE_WARN("%s: shutting down for failover; client state "
                                "will be preserved.\n", obd->obd_name);
@@ -655,7 +886,7 @@ int mds_fs_cleanup(struct obd_device *obd)
          class_disconnect_exports(obd); /* cleans up client info too */
          mds_server_free_data(mds);
  
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        push_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
          if (mds->mds_rcvd_filp) {
                  rc = filp_close(mds->mds_rcvd_filp, 0);
                  mds->mds_rcvd_filp = NULL;
@@ -686,7 +917,8 @@ int mds_fs_cleanup(struct obd_device *obd)
  
          lquota_fs_cleanup(mds_quota_interface_ref, obd);
  
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        pop_ctxt(saved, &obd->obd_lvfs_ctxt, NULL);
+        OBD_SLAB_FREE_PTR(saved, obd_lvfs_ctxt_cache);
          shrink_dcache_parent(mds->mds_fid_de);
          dput(mds->mds_fid_de);
          LL_DQUOT_OFF(obd->u.obt.obt_sb);
@@ -705,17 +937,23 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa,
          unsigned int tmpname = ll_rand();
          struct file *filp;
          struct dentry *new_child;
-        struct lvfs_run_ctxt saved;
+        struct lvfs_run_ctxt *saved = NULL;
          char fidname[LL_FID_NAMELEN];
          void *handle;
          struct lvfs_ucred ucred = { 0 };
          int rc = 0, err, namelen;
          ENTRY;
  
+        OBD_SLAB_ALLOC_PTR(saved, obd_lvfs_ctxt_cache);
+        if (saved == NULL) {
+                CERROR("cannot allocate memory for run ctxt\n");
+                RETURN(-ENOMEM);
+        }
+
          /* the owner of object file should always be root */
          cap_raise(ucred.luc_cap, CAP_SYS_RESOURCE);
  
-        push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, &ucred);
+        push_ctxt(saved, &exp->exp_obd->obd_lvfs_ctxt, &ucred);
  
          sprintf(fidname, "OBJECTS/%u.%u", tmpname, current->pid);
          filp = filp_open(fidname, O_CREAT | O_EXCL, 0666);
@@ -755,8 +993,9 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa,
                  GOTO(out_dput, rc = PTR_ERR(handle));
  
          lock_kernel();
-        rc = vfs_rename(mds->mds_objects_dir->d_inode, filp->f_dentry,
-                        mds->mds_objects_dir->d_inode, new_child);
+        rc = ll_vfs_rename(mds->mds_objects_dir->d_inode, filp->f_dentry,
+                           filp->f_vfsmnt, mds->mds_objects_dir->d_inode,
+                           new_child, filp->f_vfsmnt);
          unlock_kernel();
          if (rc)
                  CERROR("error renaming new object "LPU64":%u: rc %d\n",
@@ -779,7 +1018,8 @@ out_close:
                          rc = err;
          }
  out_pop:
-        pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, &ucred);
+        pop_ctxt(saved, &exp->exp_obd->obd_lvfs_ctxt, &ucred);
+        OBD_SLAB_FREE_PTR(saved, obd_lvfs_ctxt_cache);
          RETURN(rc);
  }
  
@@ -790,7 +1030,7 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
          struct mds_obd *mds = &exp->exp_obd->u.mds;
          struct inode *parent_inode = mds->mds_objects_dir->d_inode;
          struct obd_device *obd = exp->exp_obd;
-        struct lvfs_run_ctxt saved;
+        struct lvfs_run_ctxt *saved = NULL;
          struct lvfs_ucred ucred = { 0 };
          char fidname[LL_FID_NAMELEN];
          struct inode *inode = NULL;
@@ -799,8 +1039,14 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
          int err, namelen, rc = 0;
          ENTRY;
  
+        OBD_SLAB_ALLOC_PTR(saved, obd_lvfs_ctxt_cache);
+        if (saved == NULL) {
+                CERROR("cannot allocate memory for run ctxt\n");
+                RETURN(-ENOMEM);
+        }
+
          cap_raise(ucred.luc_cap, CAP_SYS_RESOURCE);
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, &ucred);
+        push_ctxt(saved, &obd->obd_lvfs_ctxt, &ucred);
  
          namelen = ll_fid2str(fidname, oa->o_id, oa->o_generation);
  
@@ -831,7 +1077,7 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
             vfs_unlink() context. bug 10409 */
          inode = de->d_inode;
          atomic_inc(&inode->i_count);
-        rc = vfs_unlink(mds->mds_objects_dir->d_inode, de);
+        rc = ll_vfs_unlink(mds->mds_objects_dir->d_inode, de, mds->mds_vfsmnt);
          if (rc)
                  CERROR("error destroying object "LPU64":%u: rc %d\n",
                         oa->o_id, oa->o_generation, rc);
@@ -847,6 +1093,7 @@ out_dput:
          if (inode)
                  iput(inode);
  
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &ucred);
+        pop_ctxt(saved, &obd->obd_lvfs_ctxt, &ucred);
+        OBD_SLAB_FREE_PTR(saved, obd_lvfs_ctxt_cache);
          RETURN(rc);
  }
diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h

index 1b4d6a2..4df631e 100644 (file)
--- a/lustre/mds/mds_internal.h
+++ b/lustre/mds/mds_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _MDS_INTERNAL_H
@@ -9,7 +41,8 @@
  #include <lustre_mds.h>
  
  #define MDT_ROCOMPAT_SUPP       (OBD_ROCOMPAT_LOVOBJID)
-#define MDT_INCOMPAT_SUPP       (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR)
+#define MDT_INCOMPAT_SUPP       (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | \
+                                 OBD_INCOMPAT_FID)
  
  #define MDS_SERVICE_WATCHDOG_FACTOR 2000
  
@@ -34,11 +67,7 @@ static inline void mds_export_evict(struct obd_export *exp)
  
  #ifdef __KERNEL__
  /* Open counts for files.  No longer atomic, must hold inode->i_sem */
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  # define mds_inode_oatomic(inode)    ((inode)->i_cindex)
-#else
-# define mds_inode_oatomic(inode)    ((inode)->i_attr_flags)
-#endif
  
  #ifdef HAVE_I_ALLOC_SEM
  #define MDS_UP_READ_ORPHAN_SEM(i)          UP_READ_I_ALLOC_SEM(i)
@@ -66,6 +95,13 @@ static inline int mds_orphan_open_count(struct inode *inode)
          return mds_inode_oatomic(inode);
  }
  
+static inline int mds_orphan_needed(struct obd_device *obd,
+                                    struct inode * inode)
+{
+        return (obd->obd_recovering ||
+                mds_orphan_open_count(inode) > 0);
+}
+
  static inline int mds_orphan_open_inc(struct inode *inode)
  {
          LASSERT_MDS_ORPHAN_WRITE_LOCKED(inode);
@@ -122,9 +158,8 @@ int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id,
                            struct lustre_handle *p2_lockh, int p2_lock_mode,
                            ldlm_policy_data_t *p2_policy);
  void mds_commit_cb(struct obd_device *, __u64 last_rcvd, void *data, int error);
-int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
-                       struct ptlrpc_request *req, int rc, __u32 op_data, 
-                       int force_sync);
+int mds_finish_transno(struct mds_obd *, struct inode **, void *,
+                       struct ptlrpc_request *, int, __u32, int force_sync);
  void mds_reconstruct_generic(struct ptlrpc_request *req);
  void mds_req_from_lcd(struct ptlrpc_request *req, struct lsd_client_data *cd);
  int mds_get_parent_child_locked(struct obd_device *obd, struct mds_obd *mds,
@@ -156,9 +191,15 @@ int mds_get_parents_children_locked(struct obd_device *obd,
                                      struct lustre_handle *dlm_handles,
                                      int child_mode);
  
+struct dentry *mds_lookup(struct obd_device *obd,
+                          const char *fid_name,
+                          struct dentry *dparent,
+                          int fid_namelen);
+
  void mds_shrink_reply(struct obd_device *obd, struct ptlrpc_request *req,
                        struct mds_body *body, int md_off);
  int mds_get_cookie_size(struct obd_device *obd, struct lov_mds_md *lmm);
+int mds_version_get_check(struct ptlrpc_request *, struct inode *, int);
  /* mds/mds_lib.c */
  int mds_update_unpack(struct ptlrpc_request *, int offset,
                        struct mds_update_record *);
@@ -166,7 +207,7 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req,
                     int offset);
  void mds_exit_ucred(struct lvfs_ucred *ucred, struct mds_obd *obd);
  void mds_root_squash(struct mds_obd *mds, lnet_nid_t *peernid,
-                     __u32 *fsuid, __u32 *fsgid, __u32 *cap,
+                     __u32 *fsuid, __u32 *fsgid, cfs_kernel_cap_t *cap,
                       __u32 *suppgid, __u32 *suppgid2);
  
  /* mds/mds_unlink_open.c */
@@ -174,11 +215,13 @@ int mds_osc_destroy_orphan(struct obd_device *obd, umode_t mode,
                             struct lov_mds_md *lmm, int lmm_size,
                             struct llog_cookie *logcookies, int log_unlink);
  int mds_cleanup_pending(struct obd_device *obd);
+int mds_check_stale_orphan(struct obd_device *obd, struct ll_fid *fid);
  
  /* mds/mds_log.c */
  int mds_log_op_unlink(struct obd_device *obd,
                        struct lov_mds_md *lmm, int lmm_size,
                        struct llog_cookie *logcookies, int cookies_size);
+int mds_log_op_orphan(struct obd_device *, struct lov_stripe_md *, obd_count);
  int mds_log_op_setattr(struct obd_device *obd, struct inode *inode,
                        struct lov_mds_md *lmm, int lmm_size,
                        struct llog_cookie *logcookies, int cookies_size);
@@ -191,10 +234,11 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name);
  int mds_lov_disconnect(struct obd_device *obd);
  
  int mds_lov_write_objids(struct obd_device *obd);
+int mds_lov_prepare_objids(struct obd_device *obd, struct lov_mds_md *lmm);
  void mds_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm);
  int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid);
  
-int mds_lov_start_synchronize(struct obd_device *obd, 
+int mds_lov_start_synchronize(struct obd_device *obd,
                                struct obd_device *watched,
                                void *data, int nonblock);
  int mds_post_mds_lovconf(struct obd_device *obd);
@@ -203,7 +247,8 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
  int mds_get_default_md(struct obd_device *obd, struct lov_mds_md *lmm,
                         int *lmmsize);
  int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode,
-                       struct lov_mds_md *lmm, int lmm_size);
+                       struct lov_mds_md *lmm, int lmm_size,
+                       __u64 connect_flags);
  int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp);
  
  /* mds/mds_open.c */
@@ -221,10 +266,15 @@ int mds_close(struct ptlrpc_request *req, int offset);
  int mds_done_writing(struct ptlrpc_request *req, int offset);
  
  /*mds/mds_join.c*/
-int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req, 
+int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req,
                    struct dentry *dchild, struct lustre_handle *lockh);
  
  /* mds/mds_fs.c */
+int mds_update_client_epoch(struct obd_export *exp);
+void mds_update_last_epoch(struct obd_device *obd);
+int mds_export_stats_init(struct obd_device *obd,
+                          struct obd_export *exp,
+                          void *client_nid);
  int mds_client_add(struct obd_device *obd, struct obd_export *exp,
                     int cl_off, void *localdata);
  int mds_client_free(struct obd_export *exp);
@@ -242,10 +292,10 @@ int mds_postrecov(struct obd_device *obd);
  int mds_init_export(struct obd_export *exp);
  #ifdef __KERNEL__
  int mds_get_md(struct obd_device *, struct inode *, void *md, int *size,
-               int lock, int flags);
+               int lock, int flags, __u64 connect_flags);
  int mds_pack_md(struct obd_device *, struct lustre_msg *, int offset,
-                struct mds_body *, struct inode *, int lock, int flags);
-void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode);
+                struct mds_body *, struct inode *, int lock, int flags,
+                __u64 connect_flags);
  void mds_pack_inode2body(struct mds_body *body, struct inode *inode);
  #endif
  int mds_pack_acl(struct mds_export_data *med, struct inode *inode,
diff --git a/lustre/mds/mds_join.c b/lustre/mds/mds_join.c

index af5ab26..d5578f6 100644 (file)
--- a/lustre/mds/mds_join.c
+++ b/lustre/mds/mds_join.c
@@ -1,25 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/mds/mds_join.c
- *  Lustre Metadata join handler file
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2005 Cluster File Systems, Inc.
- *   Author: Wang Di <wangdi@clusterfs.com>
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_join.c
+ *
+ * Lustre Metadata join handler file
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -68,7 +86,8 @@ static int mds_insert_join_lmm(struct llog_handle *llh,
          ENTRY;
  
  
-        sz_med = lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count));
+        sz_med = lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count),
+                                 LOV_MAGIC);
          sz_med += 2 * sizeof(__u64);
          sz_med = size_round(sz_med);
  
@@ -84,7 +103,8 @@ static int mds_insert_join_lmm(struct llog_handle *llh,
          med->med_start = start;
          med->med_len = len;
          memcpy(&med->med_lmm, lmm,
-                lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count)));
+                lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count),
+                                LOV_MAGIC));
  
          rc = llog_write_rec(llh, &rec, NULL, 0, med, -1);
          OBD_FREE(med, sz_med);
@@ -155,11 +175,12 @@ static int mdsea_cancel_last_extent(struct llog_handle *llh_tail,
                         med->med_start, cbdata->mc_headfile_sz);
                  if (!cbdata->mc_lmm) {
                          int stripe = le32_to_cpu(med->med_lmm.lmm_stripe_count);
-                        OBD_ALLOC(cbdata->mc_lmm, lov_mds_md_size(stripe));
+                        OBD_ALLOC(cbdata->mc_lmm,
+                                  lov_mds_md_size(stripe, LOV_MAGIC));
                          if (!cbdata->mc_lmm)
                                  RETURN(-ENOMEM);
                          memcpy(cbdata->mc_lmm, &med->med_lmm,
-                               lov_mds_md_size(stripe));
+                               lov_mds_md_size(stripe, LOV_MAGIC));
                  }
                  RETURN(LLOG_DEL_RECORD);
          }
@@ -202,7 +223,8 @@ static int  mds_adjust_last_extent(struct llog_handle *llh_head,
  exit:
          if (cbdata && cbdata->mc_lmm)
                  OBD_FREE(cbdata->mc_lmm,
-                         lov_mds_md_size(cbdata->mc_lmm->lmm_stripe_count));
+                         lov_mds_md_size(cbdata->mc_lmm->lmm_stripe_count,
+                                         LOV_MAGIC));
          if (cbdata)
                  OBD_FREE_PTR(cbdata);
  
@@ -212,8 +234,8 @@ exit:
  static void mds_finish_join(struct mds_obd *mds, struct ptlrpc_request *req,
                             struct inode *inode, struct lov_mds_md_join *lmmj)
  {
-        struct mds_body *body = (struct mds_body *)
-                                lustre_msg_buf(req->rq_repmsg, 1, 0);
+        struct mds_body *body = lustre_msg_buf(req->rq_repmsg,DLM_REPLY_REC_OFF,
+                                               sizeof(*body));
          int max_cookiesize = lmmj->lmmj_md.lmm_stripe_count *
                                  sizeof(struct llog_cookie);
          int max_easize = sizeof(*lmmj);
@@ -221,7 +243,7 @@ static void mds_finish_join(struct mds_obd *mds, struct ptlrpc_request *req,
          CDEBUG(D_INFO, "change the max md size from %d to "LPSZ"\n",
                 mds->mds_max_mdsize, sizeof(*lmmj));
  
-        if (mds->mds_max_mdsize < max_easize || 
+        if (mds->mds_max_mdsize < max_easize ||
              mds->mds_max_cookiesize < max_cookiesize) {
                  body->max_mdsize = mds->mds_max_mdsize > max_easize ?
                                     mds->mds_max_mdsize : max_easize;
@@ -236,7 +258,6 @@ static void mds_finish_join(struct mds_obd *mds, struct ptlrpc_request *req,
                  CDEBUG(D_INODE, "updating max_mdsize/max_cookiesize: %d/%d\n",
                         mds->mds_max_mdsize, mds->mds_max_cookiesize);
  
-        mds_pack_inode2fid(&body->fid1, inode);
          mds_pack_inode2body(body, inode);
  }
  
@@ -260,7 +281,7 @@ static int mds_join_unlink_tail_inode(struct mds_update_record *rec,
                  ldlm_lock_decref(lockh, LCK_EX);
  
          head_inode = dchild->d_inode;
-        mdc_pack_fid(&head_fid, head_inode->i_ino, head_inode->i_generation,
+        ll_pack_fid(&head_fid, head_inode->i_ino, head_inode->i_generation,
                        head_inode->i_mode & S_IFMT);
  
          rc = mds_get_parents_children_locked(obd, mds, &join_rec->jr_fid,
@@ -295,7 +316,8 @@ static int mds_join_unlink_tail_inode(struct mds_update_record *rec,
                  GOTO(cleanup, rc);
          }
  
-        rc = mds_get_md(obd, tail_inode, tail_lmm, &lmm_size, 1, 0);
+        rc = mds_get_md(obd, tail_inode, tail_lmm, &lmm_size, 1, 0,
+                        req->rq_export->exp_connect_flags);
          if (rc < 0) /* get md fails */
                  GOTO(cleanup, rc);
  
@@ -303,7 +325,9 @@ static int mds_join_unlink_tail_inode(struct mds_update_record *rec,
                  le32_to_cpu(tail_lmm->lmm_magic) == LOV_MAGIC);
  
          LASSERT(de_tailparent);
-        rc = vfs_unlink(de_tailparent->d_inode, de_tail);
+        LOCK_INODE_MUTEX(de_tailparent->d_inode);
+        rc = ll_vfs_unlink(de_tailparent->d_inode, de_tail, mds->mds_vfsmnt);
+        UNLOCK_INODE_MUTEX(de_tailparent->d_inode);
  
          if (rc == 0) {
                  CDEBUG(D_INODE, "delete the tail inode %lu/%u \n",
@@ -336,6 +360,7 @@ int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req,
  {
          struct mds_obd *mds = mds_req2mds(req);
          struct obd_device *obd = req->rq_export->exp_obd;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct inode *head_inode = NULL;
          struct lvfs_run_ctxt saved;
          void *handle = NULL;
@@ -383,7 +408,8 @@ int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req,
  
          LOCK_INODE_MUTEX(head_inode);
          cleanup_phase = 1;
-        rc = mds_get_md(obd, head_inode, head_lmm, &size, 0, 0);
+        rc = mds_get_md(obd, head_inode, head_lmm, &size, 0, 0,
+                        req->rq_export->exp_connect_flags);
          if (rc < 0)
                  GOTO(cleanup, rc);
  
@@ -478,7 +504,8 @@ int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req,
                        sizeof(struct lov_mds_md_join), "lov");
          mds_finish_join(mds, req, head_inode, head_lmmj);
  cleanup:
-        rc = mds_finish_transno(mds, head_inode, handle, req, rc, 0, 0);
+        inodes[0] = head_inode;
+        rc = mds_finish_transno(mds, inodes, handle, req, rc, 0, 0);
          switch(cleanup_phase){
          case 3:
                  llog_close(llh_head);
@@ -503,4 +530,3 @@ cleanup:
          req->rq_status = rc;
          RETURN(rc);
  }
-
diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c

index 6a33465..5b16fba 100644 (file)
--- a/lustre/mds/mds_lib.c
+++ b/lustre/mds/mds_lib.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_MDS
@@ -34,11 +46,7 @@
  #include <linux/stat.h>
  #include <linux/errno.h>
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# include <linux/locks.h>   // for wait_on_buffer
-#else
-# include <linux/buffer_head.h>   // for wait_on_buffer
-#endif
+#include <linux/buffer_head.h>   // for wait_on_buffer
  #include <linux/unistd.h>
  
  #include <asm/system.h>
@@ -53,7 +61,7 @@
  #include <lustre_lib.h>
  #include "mds_internal.h"
  
-void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode)
+static void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode)
  {
          fid->id = inode->i_ino;
          fid->generation = inode->i_generation;
@@ -72,6 +80,7 @@ void mds_pack_inode2body(struct mds_body *b, struct inode *inode)
                  b->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLATIME |
                              OBD_MD_FLMTIME | OBD_MD_FLRDEV;
  
+        mds_pack_inode2fid(&b->fid1, inode);
          b->ino = inode->i_ino;
          b->atime = LTIME_S(inode->i_atime);
          b->mtime = LTIME_S(inode->i_mtime);
@@ -81,9 +90,7 @@ void mds_pack_inode2body(struct mds_body *b, struct inode *inode)
          b->blocks = inode->i_blocks;
          b->uid = inode->i_uid;
          b->gid = inode->i_gid;
-        b->flags = (b->flags & MDS_BFLAG_EXT_FLAGS) |
-                   ll_inode_to_ext_flags(inode->i_flags,
-                                         !(b->flags & MDS_BFLAG_EXT_FLAGS));
+        b->flags = ll_inode_to_ext_flags(inode->i_flags, MDS_BFLAG_EXT_FLAGS);
          b->rdev = inode->i_rdev;
          /* Return the correct link count for orphan inodes */
          b->nlink = mds_inode_is_orphan(inode) ? 0 : inode->i_nlink;
@@ -144,7 +151,7 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset,
  
          r->ur_uc.luc_fsuid = rec->sa_fsuid;
          r->ur_uc.luc_fsgid = rec->sa_fsgid;
-        r->ur_uc.luc_cap = rec->sa_cap;
+        cfs_kernel_cap_unpack(&r->ur_uc.luc_cap, rec->sa_cap);
          r->ur_uc.luc_suppgid1 = rec->sa_suppgid;
          r->ur_uc.luc_suppgid2 = -1;
          r->ur_fid1 = &rec->sa_fid;
@@ -194,7 +201,7 @@ static int mds_create_unpack(struct ptlrpc_request *req, int offset,
  
          r->ur_uc.luc_fsuid = rec->cr_fsuid;
          r->ur_uc.luc_fsgid = rec->cr_fsgid;
-        r->ur_uc.luc_cap = rec->cr_cap;
+        cfs_kernel_cap_unpack(&r->ur_uc.luc_cap, rec->cr_cap);
          r->ur_uc.luc_suppgid1 = rec->cr_suppgid;
          r->ur_uc.luc_suppgid2 = -1;
          r->ur_fid1 = &rec->cr_fid;
@@ -227,7 +234,7 @@ static int mds_create_unpack(struct ptlrpc_request *req, int offset,
          if (lustre_msg_buflen(req->rq_reqmsg, offset + 3)) {
                  r->ur_dlm = lustre_swab_reqbuf(req, offset + 3,
                                                 sizeof(*r->ur_dlm),
-                                               lustre_swab_ldlm_request); 
+                                               lustre_swab_ldlm_request);
                  if (r->ur_dlm == NULL)
                          RETURN (-EFAULT);
          }
@@ -247,7 +254,7 @@ static int mds_link_unpack(struct ptlrpc_request *req, int offset,
  
          r->ur_uc.luc_fsuid = rec->lk_fsuid;
          r->ur_uc.luc_fsgid = rec->lk_fsgid;
-        r->ur_uc.luc_cap = rec->lk_cap;
+        cfs_kernel_cap_unpack(&r->ur_uc.luc_cap, rec->lk_cap);
          r->ur_uc.luc_suppgid1 = rec->lk_suppgid1;
          r->ur_uc.luc_suppgid2 = rec->lk_suppgid2;
          r->ur_fid1 = &rec->lk_fid1;
@@ -282,7 +289,7 @@ static int mds_unlink_unpack(struct ptlrpc_request *req, int offset,
  
          r->ur_uc.luc_fsuid = rec->ul_fsuid;
          r->ur_uc.luc_fsgid = rec->ul_fsgid;
-        r->ur_uc.luc_cap = rec->ul_cap;
+        cfs_kernel_cap_unpack(&r->ur_uc.luc_cap, rec->ul_cap);
          r->ur_uc.luc_suppgid1 = rec->ul_suppgid;
          r->ur_uc.luc_suppgid2 = -1;
          r->ur_mode = rec->ul_mode;
@@ -295,7 +302,6 @@ static int mds_unlink_unpack(struct ptlrpc_request *req, int offset,
          if (r->ur_name == NULL)
                  RETURN(-EFAULT);
          r->ur_namelen = lustre_msg_buflen(req->rq_reqmsg, offset + 1);
-        
          if (lustre_msg_buflen(req->rq_reqmsg, offset + 2)) {
                  r->ur_dlm = lustre_swab_reqbuf(req, offset + 2,
                                                 sizeof(*r->ur_dlm),
@@ -319,7 +325,7 @@ static int mds_rename_unpack(struct ptlrpc_request *req, int offset,
  
          r->ur_uc.luc_fsuid = rec->rn_fsuid;
          r->ur_uc.luc_fsgid = rec->rn_fsgid;
-        r->ur_uc.luc_cap = rec->rn_cap;
+        cfs_kernel_cap_unpack(&r->ur_uc.luc_cap, rec->rn_cap);
          r->ur_uc.luc_suppgid1 = rec->rn_suppgid1;
          r->ur_uc.luc_suppgid2 = rec->rn_suppgid2;
          r->ur_fid1 = &rec->rn_fid1;
@@ -360,7 +366,7 @@ static int mds_open_unpack(struct ptlrpc_request *req, int offset,
  
          r->ur_uc.luc_fsuid = rec->cr_fsuid;
          r->ur_uc.luc_fsgid = rec->cr_fsgid;
-        r->ur_uc.luc_cap = rec->cr_cap;
+        cfs_kernel_cap_unpack(&r->ur_uc.luc_cap, rec->cr_cap);
          r->ur_uc.luc_suppgid1 = rec->cr_suppgid;
          r->ur_uc.luc_suppgid2 = -1;
          r->ur_fid1 = &rec->cr_fid;
@@ -428,7 +434,7 @@ int mds_update_unpack(struct ptlrpc_request *req, int offset,
  }
  
  void mds_root_squash(struct mds_obd *mds, lnet_nid_t *peernid,
-                     __u32 *fsuid, __u32 *fsgid, __u32 *cap,
+                     __u32 *fsuid, __u32 *fsgid, cfs_kernel_cap_t *kcap,
                       __u32 *suppgid, __u32 *suppgid2)
  {
          if (!mds->mds_squash_uid || *fsuid)
@@ -437,13 +443,13 @@ void mds_root_squash(struct mds_obd *mds, lnet_nid_t *peernid,
          if (*peernid == mds->mds_nosquash_nid)
                  return;
  
-        CDEBUG(D_OTHER, "squash req from %s, (%d:%d/%x)=>(%d:%d/%x)\n",
-               libcfs_nid2str(*peernid), *fsuid, *fsgid, *cap,
-               mds->mds_squash_uid, mds->mds_squash_gid, 0);
+        CDEBUG(D_OTHER, "squash req from %s, (%d:%d)=>(%d:%d)\n",
+               libcfs_nid2str(*peernid), *fsuid, *fsgid,
+               mds->mds_squash_uid, mds->mds_squash_gid);
  
          *fsuid = mds->mds_squash_uid;
          *fsgid = mds->mds_squash_gid;
-        *cap = 0;
+        cfs_kernel_cap_unpack(kcap, 0);
          *suppgid = -1;
          if (suppgid2)
                  *suppgid2 = -1;
@@ -467,13 +473,13 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req,
          } else
  #endif
          {
+                cfs_kernel_cap_unpack(&ucred->luc_cap, body->capability);
                  mds_root_squash(mds, &req->rq_peer.nid, &body->fsuid,
-                                &body->fsgid, &body->capability,
+                                &body->fsgid, &ucred->luc_cap,
                                  &body->suppgid, NULL);
  
                  ucred->luc_fsuid = body->fsuid;
                  ucred->luc_fsgid = body->fsgid;
-                ucred->luc_cap = body->capability;
          }
  
          ucred->luc_uce = upcall_cache_get_entry(mds->mds_group_hash,
diff --git a/lustre/mds/mds_log.c b/lustre/mds/mds_log.c

index 189707f..ca76e09 100644 (file)
--- a/lustre/mds/mds_log.c
+++ b/lustre/mds/mds_log.c
@@ -1,30 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mds/mds_log.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_log.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_MDS
@@ -39,14 +52,12 @@
  #include <obd_class.h>
  #include <lustre_fsfilt.h>
  #include <lustre_mds.h>
-#include <lustre_commit_confd.h>
  #include <lustre_log.h>
-
  #include "mds_internal.h"
  
-static int mds_llog_origin_add(struct llog_ctxt *ctxt,
-                        struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
-                        struct llog_cookie *logcookies, int numcookies)
+static int mds_llog_origin_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
+                               struct lov_stripe_md *lsm,
+                               struct llog_cookie *logcookies, int numcookies)
  {
          struct obd_device *obd = ctxt->loc_obd;
          struct obd_device *lov_obd = obd->u.mds.mds_osc_obd;
@@ -61,7 +72,7 @@ static int mds_llog_origin_add(struct llog_ctxt *ctxt,
          RETURN(rc);
  }
  
-static int mds_llog_origin_connect(struct llog_ctxt *ctxt, int count,
+static int mds_llog_origin_connect(struct llog_ctxt *ctxt,
                                     struct llog_logid *logid,
                                     struct llog_gen *gen,
                                     struct obd_uuid *uuid)
@@ -73,7 +84,7 @@ static int mds_llog_origin_connect(struct llog_ctxt *ctxt, int count,
          ENTRY;
  
          lctxt = llog_get_context(lov_obd, ctxt->loc_idx);
-        rc = llog_connect(lctxt, count, logid, gen, uuid);
+        rc = llog_connect(lctxt, logid, gen, uuid);
          llog_ctxt_put(lctxt);
          RETURN(rc);
  }
@@ -93,14 +104,39 @@ static int mds_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls
          RETURN(rc);
  }
  
-int mds_log_op_unlink(struct obd_device *obd, 
+static int mds_llog_add_unlink(struct obd_device *obd,
+                               struct lov_stripe_md *lsm, obd_count count,
+                               struct llog_cookie *logcookie, int cookies)
+{
+        struct llog_unlink_rec *lur;
+        struct llog_ctxt *ctxt;
+        int rc;
+
+        rc = obd_checkmd(obd->u.mds.mds_osc_exp, obd->obd_self_export, lsm);
+        if (rc)
+                RETURN(rc);
+        /* first prepare unlink log record */
+        OBD_ALLOC_PTR(lur);
+        if (!lur)
+                RETURN(rc = -ENOMEM);
+        lur->lur_hdr.lrh_len = lur->lur_tail.lrt_len = sizeof(*lur);
+        lur->lur_hdr.lrh_type = MDS_UNLINK_REC;
+        lur->lur_count = count;
+
+        ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+        rc = llog_add(ctxt, &lur->lur_hdr, lsm, logcookie, cookies);
+        llog_ctxt_put(ctxt);
+
+        OBD_FREE_PTR(lur);
+        RETURN(rc);
+}
+
+int mds_log_op_unlink(struct obd_device *obd,
                        struct lov_mds_md *lmm, int lmm_size,
                        struct llog_cookie *logcookies, int cookies_size)
  {
          struct mds_obd *mds = &obd->u.mds;
          struct lov_stripe_md *lsm = NULL;
-        struct llog_unlink_rec *lur;
-        struct llog_ctxt *ctxt;
          int rc;
          ENTRY;
  
@@ -110,24 +146,27 @@ int mds_log_op_unlink(struct obd_device *obd,
          rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size);
          if (rc < 0)
                  RETURN(rc);
-        rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm);
-        if (rc)
-                GOTO(out, rc);
-        /* first prepare unlink log record */
-        OBD_ALLOC(lur, sizeof(*lur));
-        if (!lur)
-                GOTO(out, rc = -ENOMEM);
-        lur->lur_hdr.lrh_len = lur->lur_tail.lrt_len = sizeof(*lur);
-        lur->lur_hdr.lrh_type = MDS_UNLINK_REC;
+        rc = mds_llog_add_unlink(obd, lsm, 0, logcookies,
+                                 cookies_size / sizeof(struct llog_cookie));
+        obd_free_memmd(mds->mds_osc_exp, &lsm);
+        RETURN(rc);
+}
  
-        ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
-        rc = llog_add(ctxt, &lur->lur_hdr, lsm, logcookies,
-                      cookies_size / sizeof(struct llog_cookie));
-        llog_ctxt_put(ctxt);
+int mds_log_op_orphan(struct obd_device *obd, struct lov_stripe_md *lsm,
+                      obd_count count)
+{
+        struct mds_obd *mds = &obd->u.mds;
+        struct llog_cookie logcookie;
+        int rc;
+        ENTRY;
  
-        OBD_FREE(lur, sizeof(*lur));
-out:
-        obd_free_memmd(mds->mds_osc_exp, &lsm);
+        if (IS_ERR(mds->mds_osc_obd))
+                RETURN(PTR_ERR(mds->mds_osc_obd));
+
+        rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm);
+        if (rc)
+                RETURN(rc);
+        rc = mds_llog_add_unlink(obd, lsm, count - 1, &logcookie, 1);
          RETURN(rc);
  }
  
@@ -189,6 +228,7 @@ int mds_llog_init(struct obd_device *obd, struct obd_device *tgt,
                    int count, struct llog_catid *logid, struct obd_uuid *uuid)
  {
          struct obd_device *lov_obd = obd->u.mds.mds_osc_obd;
+        struct llog_ctxt *ctxt;
          int rc;
          ENTRY;
  
@@ -200,13 +240,23 @@ int mds_llog_init(struct obd_device *obd, struct obd_device *tgt,
          rc = llog_setup(obd, LLOG_SIZE_REPL_CTXT, tgt, 0, NULL,
                          &mds_size_repl_logops);
          if (rc)
-                RETURN(rc);
+                GOTO(err_llog, rc);
  
          rc = obd_llog_init(lov_obd, tgt, count, logid, uuid);
-        if (rc)
+        if (rc) {
                  CERROR("lov_llog_init err %d\n", rc);
-
+                GOTO(err_cleanup, rc);
+        }
          RETURN(rc);
+err_cleanup:
+        ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
+err_llog:
+        ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
+        return rc;
  }
  
  int mds_llog_finish(struct obd_device *obd, int count)
diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c

index 6116d34..7c8262d 100644 (file)
--- a/lustre/mds/mds_lov.c
+++ b/lustre/mds/mds_lov.c
@@ -1,29 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/mds/mds_lov.c
- *  Lustre Metadata Server (mds) handling of striped file data
+ * GPL HEADER START
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_lov.c
+ *
+ * Lustre Metadata Server (mds) handling of striped file data
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -151,34 +165,201 @@ void mds_lov_destroy_objids(struct obd_device *obd)
  }
  EXPORT_SYMBOL(mds_lov_destroy_objids);
  
+/**
+ * currently exist two ways for know about ost count and max ost index.
+ * first - after ost is connected to mds and sync process finished
+ * second - get from lmm in recovery process, in case when mds not have configs,
+ * and ost isn't registered in mgs.
+ *
+ * \param mds pointer to mds structure
+ * \param index maxium ost index
+ *
+ * \retval -ENOMEM is not hame memory for new page
+ * \retval 0 is update passed
+ */
+static int mds_lov_update_max_ost(struct mds_obd *mds, obd_id index)
+{
+        __u32 page = index / OBJID_PER_PAGE();
+        __u32 off = index % OBJID_PER_PAGE();
+        obd_id *data =  mds->mds_lov_page_array[page];
+
+        if (data == NULL) {
+                OBD_ALLOC(data, MDS_LOV_ALLOC_SIZE);
+                if (data == NULL)
+                        RETURN(-ENOMEM);
+
+                mds->mds_lov_page_array[page] = data;
+        }
+
+        if (index > mds->mds_lov_objid_max_index) {
+                mds->mds_lov_objid_lastpage = page;
+                mds->mds_lov_objid_lastidx = off;
+                mds->mds_lov_objid_max_index = index;
+        }
+
+        /* workaround - New target not in objids file; increase mdsize */
+        /* ld_tgt_count is used as the max index everywhere, despite its name. */
+        if (data[off] == 0) {
+                __u32 stripes;
+
+                data[off] = 1;
+                mds->mds_lov_objid_count++;
+                stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT,
+                                mds->mds_lov_objid_count);
+
+                mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3);
+                mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie);
+                CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d "
+                       "stripes: %d/%d\n", mds->mds_max_mdsize,
+                       mds->mds_max_cookiesize, stripes);
+        }
+
+        EXIT;
+        return 0;
+}
+
+int mds_lov_prepare_objids(struct obd_device *obd, struct lov_mds_md *lmm)
+{
+        int rc = 0;
+        __u32 j;
+        struct lov_ost_data_v1 *lmm_objects;
+
+        /* if we create file without objects - lmm is NULL */
+        if (lmm == NULL)
+                return 0;
+
+        mutex_down(&obd->obd_dev_sem);
+        if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)
+                lmm_objects = ((struct lov_mds_md_v3 *)lmm)->lmm_objects;
+        else
+                lmm_objects = lmm->lmm_objects;
+
+        for (j = 0; j < le32_to_cpu(lmm->lmm_stripe_count); j++) {
+                __u32 i = le32_to_cpu(lmm_objects[j].l_ost_idx);
+                if (mds_lov_update_max_ost(&obd->u.mds, i)) {
+                        rc = -ENOMEM;
+                        break;
+                }
+        }
+        mutex_up(&obd->obd_dev_sem);
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(mds_lov_prepare_objids);
+
+/*
+ * write llog orphan record about lost ost object,
+ * Special lsm is allocated with single stripe, caller should deallocated it
+ * after use
+ */
+static int mds_log_lost_precreated(struct obd_device *obd,
+                                   struct lov_stripe_md **lsmp, int *stripes,
+                                   obd_id id, obd_count count, int idx)
+{
+        struct lov_stripe_md *lsm = *lsmp;
+        int rc;
+        ENTRY;
+
+        if (*lsmp == NULL) {
+                rc = obd_alloc_memmd(obd->u.mds.mds_osc_exp, &lsm);
+                if (rc < 0)
+                        RETURN(rc);
+                /* need only one stripe, save old value */
+                *stripes = lsm->lsm_stripe_count;
+                lsm->lsm_stripe_count = 1;
+                *lsmp = lsm;
+        }
+
+        lsm->lsm_oinfo[0]->loi_id = id;
+        lsm->lsm_oinfo[0]->loi_gr = 0; /* needed in 2.0 */
+        lsm->lsm_oinfo[0]->loi_ost_idx = idx;
+
+        rc = mds_log_op_orphan(obd, lsm, count);
+        RETURN(rc);
+}
+
  void mds_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm)
  {
          struct mds_obd *mds = &obd->u.mds;
          int j;
+        struct lov_ost_data_v1 *lmm_objects;
+#ifndef HAVE_DELAYED_RECOVERY
+        struct lov_stripe_md *lsm = NULL;
+        int stripes = 0;
+#endif
          ENTRY;
  
          /* if we create file without objects - lmm is NULL */
          if (lmm == NULL)
                  return;
  
+        if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)
+                lmm_objects = ((struct lov_mds_md_v3 *)lmm)->lmm_objects;
+        else
+                lmm_objects = lmm->lmm_objects;
+
          for (j = 0; j < le32_to_cpu(lmm->lmm_stripe_count); j++) {
-                int i = le32_to_cpu(lmm->lmm_objects[j].l_ost_idx);
-                obd_id id = le64_to_cpu(lmm->lmm_objects[j].l_object_id);
-                int page = i / OBJID_PER_PAGE();
-                int idx = i % OBJID_PER_PAGE();
-                obd_id *data = mds->mds_lov_page_array[page];
+                __u32 i = le32_to_cpu(lmm_objects[j].l_ost_idx);
+                obd_id id = le64_to_cpu(lmm_objects[j].l_object_id);
+                __u32 page = i / OBJID_PER_PAGE();
+                __u32 idx = i % OBJID_PER_PAGE();
+                obd_id *data;
+
+                data = mds->mds_lov_page_array[page];
  
-                CDEBUG(D_INODE,"update last object for ost %d"
+                CDEBUG(D_INODE,"update last object for ost %u"
                         " - new "LPU64" old "LPU64"\n", i, id, data[idx]);
                  if (id > data[idx]) {
+#ifndef HAVE_DELAYED_RECOVERY
+                        int lost = id - data[idx] - 1;
+                        /* we might have lost precreated objects due to VBR */
+                        if (lost > 0 && obd->obd_recovering) {
+                                CDEBUG(D_HA, "GAP in objids is %u\n", lost);
+                                LASSERT(obd->obd_version_recov);
+                                /* lsm is allocated if NULL */
+                                mds_log_lost_precreated(obd, &lsm, &stripes,
+                                                        data[idx] + 1, lost, i);
+                        }
+#endif
                          data[idx] = id;
                          cfs_bitmap_set(mds->mds_lov_page_dirty, page);
                  }
          }
+#ifndef HAVE_DELAYED_RECOVERY
+        if (lsm) {
+                /* restore stripes number */
+                lsm->lsm_stripe_count = stripes;
+                obd_free_memmd(mds->mds_osc_exp, &lsm);
+        }
+#endif
          EXIT;
+        return;
  }
  EXPORT_SYMBOL(mds_lov_update_objids);
  
+static int mds_lov_update_from_read(struct mds_obd *mds, obd_id *data,
+                                    __u32 count)
+{
+        __u32 i;
+        __u32 stripes;
+
+        for(i = 0; i < count; i++) {
+                if (data[i] == 0)
+                        continue;
+
+                mds->mds_lov_objid_count++;
+                stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT,
+                                mds->mds_lov_objid_count);
+
+                mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3);
+                mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie);
+                CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d stripes: "
+                       "%d/%d\n", stripes, mds->mds_max_mdsize, mds->mds_max_cookiesize);
+        }
+        EXIT;
+        return 0;
+}
+
  static int mds_lov_read_objids(struct obd_device *obd)
  {
          struct mds_obd *mds = &obd->u.mds;
@@ -196,16 +377,16 @@ static int mds_lov_read_objids(struct obd_device *obd)
  
          page = (size/(OBJID_PER_PAGE()*sizeof(obd_id)))+1;
          CDEBUG(D_INFO, "file size %d pages %d\n", (int)size, page);
-        for(i=0; i < page; i++) {
-                obd_id *data =  mds->mds_lov_page_array[i];
+        for (i = 0; i < page; i++) {
+                obd_id *data;
                  loff_t off_old = off;
  
-                LASSERT(data == NULL);
-                OBD_ALLOC(data, MDS_LOV_ALLOC_SIZE);
-                if (data == NULL)
+                LASSERT(mds->mds_lov_page_array[i] == NULL);
+                OBD_ALLOC(mds->mds_lov_page_array[i], MDS_LOV_ALLOC_SIZE);
+                if (mds->mds_lov_page_array[i] == NULL)
                          GOTO(out, rc = -ENOMEM);
  
-                mds->mds_lov_page_array[i] = data;
+                data = mds->mds_lov_page_array[i];
  
                  rc = fsfilt_read_record(obd, mds->mds_lov_objid_filp, data,
                                          OBJID_PER_PAGE()*sizeof(obd_id), &off);
@@ -213,18 +394,20 @@ static int mds_lov_read_objids(struct obd_device *obd)
                          CERROR("Error reading objids %d\n", rc);
                          GOTO(out, rc);
                  }
+
+                count = (off - off_old) / sizeof(obd_id);
+                if (mds_lov_update_from_read(mds, data, count)) {
+                        CERROR("Can't update mds data\n");
+                        GOTO(out, rc = -EIO);
+                }
+
                  if (off == off_old)
                          break; /* eof */
+         }
+         mds->mds_lov_objid_lastpage = i;
+         mds->mds_lov_objid_lastidx = count;
  
-                count += (off-off_old)/sizeof(obd_id);
-        }
-        mds->mds_lov_objid_count = count;
-        if (count) {
-                count --;
-                mds->mds_lov_objid_lastpage = count / OBJID_PER_PAGE();
-                mds->mds_lov_objid_lastidx = count % OBJID_PER_PAGE();
-        }
-        CDEBUG(D_INFO, "Read %u - %u %u objid\n", count,
+        CDEBUG(D_INFO, "Read %u - %u %u objid\n", mds->mds_lov_objid_count,
                 mds->mds_lov_objid_lastpage, mds->mds_lov_objid_lastidx);
  out:
          mds_lov_dump_objids("read",obd);
@@ -254,7 +437,7 @@ int mds_lov_write_objids(struct obd_device *obd)
                  if (i == mds->mds_lov_objid_lastpage)
                          size = (mds->mds_lov_objid_lastidx + 1) * sizeof(obd_id);
  
-               CDEBUG(D_INFO,"write %lld - %ld\n", off, size);
+                CDEBUG(D_INFO,"write %lld - %u\n", off, size);
                  rc = fsfilt_write_record(obd, mds->mds_lov_objid_filp, data,
                                           size, &off, 0);
                  if (rc < 0)
@@ -269,7 +452,7 @@ int mds_lov_write_objids(struct obd_device *obd)
  EXPORT_SYMBOL(mds_lov_write_objids);
  
  static int mds_lov_get_objid(struct obd_device * obd,
-                             __u32 idx)
+                             obd_id idx)
  {
          struct mds_obd *mds = &obd->u.mds;
          unsigned int page;
@@ -280,16 +463,9 @@ static int mds_lov_get_objid(struct obd_device * obd,
  
          page = idx / OBJID_PER_PAGE();
          off = idx % OBJID_PER_PAGE();
-        data = mds->mds_lov_page_array[page];
-        if (data == NULL) {
-                OBD_ALLOC(data, MDS_LOV_ALLOC_SIZE);
-                if (data == NULL)
-                        GOTO(out, rc = -ENOMEM);
-
-                mds->mds_lov_page_array[page] = data;
-        }
  
-        if (data[off] == 0) {
+        data = mds->mds_lov_page_array[page];
+        if (data[off] < 2) {
                  /* We never read this lastid; ask the osc */
                  struct obd_id_info lastid;
                  __u32 size = sizeof(lastid);
@@ -297,18 +473,17 @@ static int mds_lov_get_objid(struct obd_device * obd,
                  lastid.idx = idx;
                  lastid.data = &data[off];
                  rc = obd_get_info(mds->mds_osc_exp, sizeof(KEY_LAST_ID),
-                                  KEY_LAST_ID, &size, &lastid);
+                                  KEY_LAST_ID, &size, &lastid, NULL);
                  if (rc)
                          GOTO(out, rc);
  
-                if (idx > mds->mds_lov_objid_count) {
-                        mds->mds_lov_objid_count = idx;
-                        mds->mds_lov_objid_lastpage = page;
-                        mds->mds_lov_objid_lastidx = off;
-                }
+                /* workaround for clean filter */
+                if (data[off] == 0)
+                        data[off] = 1;
+
                  cfs_bitmap_set(mds->mds_lov_page_dirty, page);
          }
-        CDEBUG(D_INFO, "idx %d - %p - %d/%d - "LPU64"\n",
+        CDEBUG(D_INFO, "idx "LPU64" - %p - %d/%d - "LPU64"\n",
                 idx, data, page, off, data[off]);
  out:
          RETURN(rc);
@@ -348,9 +523,6 @@ static int mds_lov_set_one_nextid(struct obd_device * obd, __u32 idx, obd_id *id
  
          LASSERT(!obd->obd_recovering);
  
-        /* obd->obd_dev_sem must be held so mds_lov_objids doesn't change */
-        LASSERT_SEM_LOCKED(&obd->obd_dev_sem);
-
          info.idx = idx;
          info.data = id;
          rc = obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_NEXT_ID),
@@ -362,25 +534,13 @@ static int mds_lov_set_one_nextid(struct obd_device * obd, __u32 idx, obd_id *id
          RETURN(rc);
  }
  
-static __u32 mds_lov_get_idx(struct obd_export *lov,
-                             struct obd_uuid *ost_uuid)
-{
-        int rc;
-        int valsize = sizeof(ost_uuid);
-
-        rc = obd_get_info(lov, sizeof(KEY_LOV_IDX), KEY_LOV_IDX,
-                          &valsize, ost_uuid);
-        LASSERT(rc >= 0);
-
-        RETURN(rc);
-}
-
  /* Update the lov desc for a new size lov. */
-static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov)
+static int mds_lov_update_desc(struct obd_device *obd, __u32 index,
+                               struct obd_uuid *uuid)
  {
          struct mds_obd *mds = &obd->u.mds;
          struct lov_desc *ld;
-        __u32 stripes, valsize = sizeof(mds->mds_lov_desc);
+        __u32 valsize = sizeof(mds->mds_lov_desc);
          int rc = 0;
          ENTRY;
  
@@ -388,30 +548,27 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov)
          if (!ld)
                  RETURN(-ENOMEM);
  
-        rc = obd_get_info(lov, sizeof(KEY_LOVDESC), KEY_LOVDESC,
-                          &valsize, ld);
+        rc = obd_get_info(mds->mds_osc_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC,
+                          &valsize, ld, NULL);
          if (rc)
                  GOTO(out, rc);
  
          /* Don't change the mds_lov_desc until the objids size matches the
             count (paranoia) */
          mds->mds_lov_desc = *ld;
-        CDEBUG(D_CONFIG, "updated lov_desc, tgt_count: %d\n",
-               mds->mds_lov_desc.ld_tgt_count);
+        CDEBUG(D_CONFIG, "updated lov_desc, tgt_count: %d - idx %d / uuid %s\n",
+               mds->mds_lov_desc.ld_tgt_count, index, uuid->uuid);
  
-        stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT,
-                        mds->mds_lov_desc.ld_tgt_count);
-
-        mds->mds_max_mdsize = lov_mds_md_size(stripes);
-        mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie);
-        CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d stripes: "
-               "%d/%d\n", mds->mds_max_mdsize, mds->mds_max_cookiesize,
-               stripes);
+        mutex_down(&obd->obd_dev_sem);
+        rc = mds_lov_update_max_ost(mds, index);
+        mutex_up(&obd->obd_dev_sem);
+        if (rc)
+                GOTO(out, rc = -ENOMEM);
  
          /* If we added a target we have to reconnect the llogs */
          /* We only _need_ to do this at first add (idx), or the first time
             after recovery.  However, it should now be safe to call anytime. */
-        rc = llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count, NULL);
+        rc = llog_cat_initialize(obd, index, uuid);
  
  out:
          OBD_FREE(ld, sizeof(*ld));
@@ -434,10 +591,8 @@ static int mds_lov_update_mds(struct obd_device *obd,
          ENTRY;
  
          /* Don't let anyone else mess with mds_lov_objids now */
-        mutex_down(&obd->obd_dev_sem);
-
          old_count = mds->mds_lov_desc.ld_tgt_count;
-        rc = mds_lov_update_desc(obd, mds->mds_osc_exp);
+        rc = mds_lov_update_desc(obd, idx, &watched->u.cli.cl_target_uuid);
          if (rc)
                  GOTO(out, rc);
  
@@ -476,7 +631,6 @@ static int mds_lov_update_mds(struct obd_device *obd,
          CDEBUG(D_CONFIG, "last object "LPU64" from OST %d rc=%d\n",
                 data[off], idx, rc);
  out:
-        mutex_up(&obd->obd_dev_sem);
          RETURN(rc);
  }
  
@@ -498,8 +652,25 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
          mds->mds_osc_obd = class_name2obd(lov_name);
          if (!mds->mds_osc_obd) {
                  CERROR("MDS cannot locate LOV %s\n", lov_name);
-                mds->mds_osc_obd = ERR_PTR(-ENOTCONN);
-                RETURN(-ENOTCONN);
+                GOTO(error_exit, rc = -ENOTCONN);
+        }
+
+        mutex_down(&obd->obd_dev_sem);
+        rc = mds_lov_read_objids(obd);
+        mutex_up(&obd->obd_dev_sem);
+        if (rc) {
+                CERROR("cannot read lov_objids: rc = %d\n", rc);
+                GOTO(error_exit, rc);
+        }
+
+        /* Deny new client connections until we are sure we have some OSTs */
+        obd->obd_no_conn = 1;
+
+        rc = obd_register_observer(mds->mds_osc_obd, obd);
+        if (rc) {
+                CERROR("MDS cannot register as observer of LOV %s (%d)\n",
+                       lov_name, rc);
+                GOTO(error_exit, rc);
          }
  
          OBD_ALLOC(data, sizeof(*data));
@@ -507,7 +678,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                  RETURN(-ENOMEM);
          data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
                  OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64 | OBD_CONNECT_AT |
-                OBD_CONNECT_CHANGE_QS;
+                OBD_CONNECT_CHANGE_QS | OBD_CONNECT_MDS;
  #ifdef HAVE_LRU_RESIZE_SUPPORT
          data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
  #endif
@@ -517,64 +688,16 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
          OBD_FREE(data, sizeof(*data));
          if (rc) {
                  CERROR("MDS cannot connect to LOV %s (%d)\n", lov_name, rc);
-                mds->mds_osc_obd = ERR_PTR(rc);
-                RETURN(rc);
+                GOTO(error_exit, rc);
          }
          mds->mds_osc_exp = class_conn2export(&conn);
+        /* we not want postrecov in case clean fs, in other cases postrecov will
+         * be called from ldlm. otherwise we can call postrecov twice - in case
+         * short recovery */
  
-        rc = obd_register_observer(mds->mds_osc_obd, obd);
-        if (rc) {
-                CERROR("MDS cannot register as observer of LOV %s (%d)\n",
-                       lov_name, rc);
-                GOTO(err_discon, rc);
-        }
-
-        /* Deny new client connections until we are sure we have some OSTs */
-        obd->obd_no_conn = 1;
-
-        mutex_down(&obd->obd_dev_sem);
-        rc = mds_lov_read_objids(obd);
-        if (rc) {
-                CERROR("cannot read %s: rc = %d\n", "lov_objids", rc);
-                GOTO(err_reg, rc);
-        }
-
-        rc = mds_lov_update_desc(obd, mds->mds_osc_exp);
-        if (rc)
-                GOTO(err_reg, rc);
-
-        /* If we're mounting this code for the first time on an existing FS,
-         * we need to populate the objids array from the real OST values */
-        if (mds->mds_lov_desc.ld_tgt_count > mds->mds_lov_objid_count) {
-                __u32 i = mds->mds_lov_objid_count;
-                for(; i <= mds->mds_lov_desc.ld_tgt_count; i++) {
-                        rc = mds_lov_get_objid(obd, i);
-                        if (rc != 0)
-                                break;
-                }
-                if (rc == 0)
-                        rc = mds_lov_write_objids(obd);
-                if (rc)
-                        CERROR("got last objids from OSTs, but error "
-                                "in update objids file: %d\n", rc);
-        }
-
-        mutex_up(&obd->obd_dev_sem);
-
-        /* I want to see a callback happen when the OBD moves to a
-         * "For General Use" state, and that's when we'll call
-         * set_nextid().  The class driver can help us here, because
-         * it can use the obd_recovering flag to determine when the
-         * the OBD is full available. */
-        if (!obd->obd_recovering)
-                rc = mds_postrecov(obd);
          RETURN(rc);
  
-err_reg:
-        mutex_up(&obd->obd_dev_sem);
-        obd_register_observer(mds->mds_osc_obd, NULL);
-err_discon:
-        obd_disconnect(mds->mds_osc_exp);
+error_exit:
          mds->mds_osc_exp = NULL;
          mds->mds_osc_obd = ERR_PTR(rc);
          RETURN(rc);
@@ -739,9 +862,8 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
          case OBD_IOC_SET_READONLY: {
                  void *handle;
                  struct inode *inode = obd->u.obt.obt_sb->s_root->d_inode;
-                BDEVNAME_DECLARE_STORAGE(tmp);
                  LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n",
-                       obd->obd_name, ll_bdevname(obd->u.obt.obt_sb, tmp));
+                       obd->obd_name, obd->u.obt.obt_sb->s_id);
  
                  handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL);
                  if (!IS_ERR(handle))
@@ -813,7 +935,10 @@ static void mds_allow_cli(struct obd_device *obd, unsigned long flag)
                  obd->u.mds.mds_fl_cfglog = 1;
          if (flag & CONFIG_SYNC)
                  obd->u.mds.mds_fl_synced = 1;
-        if (obd->u.mds.mds_fl_cfglog /* bz11778: && obd->u.mds.mds_fl_synced */)
+        if (flag & CONFIG_TARGET)
+                obd->u.mds.mds_fl_target = 1;
+        if (obd->u.mds.mds_fl_cfglog && obd->u.mds.mds_fl_target
+            /* bz11778: && obd->u.mds.mds_fl_synced */)
                  /* Open for clients */
                  obd->obd_no_conn = 0;
  }
@@ -867,8 +992,7 @@ static int __mds_lov_synchronize(void *data)
  
          OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT, 60);
  
-        rc = llog_connect(ctxt, obd->u.mds.mds_lov_desc.ld_tgt_count,
-                          NULL, NULL, uuid);
+        rc = llog_connect(ctxt, NULL, NULL, uuid);
          llog_ctxt_put(ctxt);
  
          if (rc != 0) {
@@ -897,7 +1021,7 @@ out:
                  CERROR("%s sync failed %d, deactivating\n", obd_uuid2str(uuid),
                         rc);
                  if (!obd->obd_stopping && mds->mds_osc_obd &&
-                    !mds->mds_osc_obd->obd_stopping && !watched->obd_stopping) 
+                    !mds->mds_osc_obd->obd_stopping && !watched->obd_stopping)
                          obd_notify(mds->mds_osc_obd, watched,
                                     OBD_NOTIFY_INACTIVE, NULL);
          } else {
@@ -926,7 +1050,6 @@ int mds_lov_start_synchronize(struct obd_device *obd,
                                void *data, int nonblock)
  {
          struct mds_lov_sync_info *mlsi;
-        struct mds_obd *mds = &obd->u.mds;
          int rc;
          struct obd_uuid *uuid;
          ENTRY;
@@ -938,12 +1061,10 @@ int mds_lov_start_synchronize(struct obd_device *obd,
          if (mlsi == NULL)
                  RETURN(-ENOMEM);
  
+        LASSERT(data);
          mlsi->mlsi_obd = obd;
          mlsi->mlsi_watched = watched;
-        if (data)
-                mlsi->mlsi_index = *(__u32 *)data;
-        else
-                mlsi->mlsi_index = mds_lov_get_idx(mds->mds_osc_exp, uuid);
+        mlsi->mlsi_index = *(__u32 *)data;
  
          /* Although class_export_get(obd->obd_self_export) would lock
             the MDS in place, since it's only a self-export
@@ -982,20 +1103,30 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
          int rc = 0;
          ENTRY;
  
+        CDEBUG(D_CONFIG, "notify %s ev=%d\n", watched->obd_name, ev);
+
          switch (ev) {
          /* We only handle these: */
          case OBD_NOTIFY_ACTIVE:
+                /* lov want one or more _active_ targets for work */
+                mds_allow_cli(obd, CONFIG_TARGET);
+                /* activate event should be pass lov idx as argument */
          case OBD_NOTIFY_SYNC:
          case OBD_NOTIFY_SYNC_NONBLOCK:
+                /* sync event should be pass lov idx as argument */
                  break;
          case OBD_NOTIFY_CONFIG:
                  mds_allow_cli(obd, (unsigned long)data);
+                /* call this only when config is processed and stale_export_age
+                 * value is configured */
+                class_disconnect_expired_exports(obd);
+                /* quota_type has been processed, we can now handle
+                 * incoming quota requests */
+                QUOTA_MASTER_READY(&obd->u.obt.obt_qctxt);
          default:
                  RETURN(0);
          }
  
-        CDEBUG(D_CONFIG, "notify %s ev=%d\n", watched->obd_name, ev);
-
          if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) != 0) {
                  CERROR("unexpected notification of %s %s!\n",
                         watched->obd_type->typ_name, watched->obd_name);
@@ -1009,14 +1140,14 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
                  /* We still have to fix the lov descriptor for ost's added
                     after the mdt in the config log.  They didn't make it into
                     mds_lov_connect. */
-                mutex_down(&obd->obd_dev_sem);
-                rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp);
-                mutex_up(&obd->obd_dev_sem);
+                LASSERT(data);
+                rc = mds_lov_update_desc(obd, *(__u32 *)data,
+                                          &watched->u.cli.cl_target_uuid);
+
                  mds_allow_cli(obd, CONFIG_SYNC);
                  RETURN(rc);
          }
  
-        LASSERT(!llog_ctxt_null(obd, LLOG_MDS_OST_ORIG_CTXT));
          rc = mds_lov_start_synchronize(obd, watched, data,
                                         !(ev == OBD_NOTIFY_SYNC));
  
@@ -1060,15 +1191,46 @@ int mds_get_default_md(struct obd_device *obd, struct lov_mds_md *lmm,
   * reason.  We will not delete the old lmm data until we have written the
   * new format lmm data in fsfilt_set_md(). */
  int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode,
-                       struct lov_mds_md *lmm, int lmm_size)
+                       struct lov_mds_md *lmm, int lmm_size,
+                       __u64 connect_flags)
  {
          struct lov_stripe_md *lsm = NULL;
          void *handle;
          int rc, err;
          ENTRY;
  
-        if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC ||
-            le32_to_cpu(lmm->lmm_magic == LOV_MAGIC_JOIN))
+        if (((connect_flags & OBD_CONNECT_LOV_V3) == 0) &&
+            (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)) {
+                /* client does not support LOV_MAGIC_V3, so we have to convert
+                 * it to V1
+                 * we convert the lmm from v3 to v1
+                 * and return the new size (which is smaller)
+                 * the caller supports this way to return the new size */
+                int new_lmm_size;
+
+                lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
+                /* lmm_stripe_count for non reg files is not used or -1 */
+                if (!S_ISREG(inode->i_mode)) {
+                        new_lmm_size = lov_mds_md_size(0, LOV_MAGIC_V1);
+                } else {
+                        __u32 lmm_stripe_count;
+
+                        lmm_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+                        new_lmm_size = lov_mds_md_size(lmm_stripe_count,
+                                                       LOV_MAGIC_V1);
+                        /* move the objects to the new place */
+                        memmove(lmm->lmm_objects,
+                                ((struct lov_mds_md_v3 *)lmm)->lmm_objects,
+                                lmm_stripe_count * sizeof(struct lov_ost_data_v1));
+                }
+                /* even if new size is smaller than old one,
+                 * this should not generate memory leak */
+                RETURN(new_lmm_size);
+        }
+
+        if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1 ||
+            le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3 ||
+            le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_JOIN)
                  RETURN(0);
  
          CDEBUG(D_INODE, "converting LOV EA on %lu/%u from %#08x to %#08x\n",
diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c

index e7d4023..2dad39d 100644 (file)
--- a/lustre/mds/mds_open.c
+++ b/lustre/mds/mds_open.c
@@ -1,29 +1,44 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Mike Shaver <shaver@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_open.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -35,12 +50,8 @@
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-# include <linux/buffer_head.h>
-# include <linux/workqueue.h>
-#else
-# include <linux/locks.h>
-#endif
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
  
  #include <obd_class.h>
  #include <obd_lov.h>
@@ -345,8 +356,11 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
                  lmm = rec->ur_eadata;
                  LASSERT(lmm);
  
-                if (*handle == NULL)
-                        *handle = fsfilt_start(obd,inode,FSFILT_OP_CREATE,NULL);
+                if (*handle == NULL) {
+                        int stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+                        *handle = fsfilt_start_log(obd, inode, FSFILT_OP_CREATE,
+                                                   NULL, stripe_count);
+                }
                  if (IS_ERR(*handle)) {
                          rc = PTR_ERR(*handle);
                          *handle = NULL;
@@ -356,11 +370,11 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
                  rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov");
                  if (rc)
                          CERROR("open replay failed to set md:%d\n", rc);
-                lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, lmm_size);
-                LASSERT(lmm_buf);
-                memcpy(lmm_buf, lmm, lmm_size);
  
-                *objid = lmm_buf;
+                /* for replay we not need send lmm to client, this not used now */
+                lustre_shrink_reply(req, offset, 0, 1);
+                *objid = lmm;
+
                  RETURN(rc);
          }
  
@@ -380,7 +394,6 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
  
          obdo_from_inode(oinfo.oi_oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
                          OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-
          if (!(rec->ur_flags & MDS_OPEN_HAS_OBJS)) {
                  /* check if things like lfs setstripe are sending us the ea */
                  if (rec->ur_flags & MDS_OPEN_HAS_EA) {
@@ -390,22 +403,24 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
                          if (rc)
                                  GOTO(out_oa, rc);
                  } else {
-                        OBD_ALLOC(lmm, mds->mds_max_mdsize);
+                        __u32 lmm_sz = mds->mds_max_mdsize;
+                        OBD_ALLOC(lmm, lmm_sz);
                          if (lmm == NULL)
                                  GOTO(out_oa, rc = -ENOMEM);
  
-                        lmm_size = mds->mds_max_mdsize;
+                        lmm_size = lmm_sz;
                          rc = mds_get_md(obd, dchild->d_parent->d_inode,
-                                        lmm, &lmm_size, 1, 0);
+                                        lmm, &lmm_size, 1, 0,
+                                        req->rq_export->exp_connect_flags);
                          if (rc > 0)
                                  rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE,
                                                     mds->mds_osc_exp,
                                                     0, &oinfo.oi_md, lmm);
-                        OBD_FREE(lmm, mds->mds_max_mdsize);
+                        OBD_FREE(lmm, lmm_sz);
                          if (rc)
                                  GOTO(out_oa, rc);
                  }
-                rc = obd_create(mds->mds_osc_exp, oinfo.oi_oa, 
+                rc = obd_create(mds->mds_osc_exp, oinfo.oi_oa,
                                  &oinfo.oi_md, &oti);
                  if (rc) {
                          int level = D_ERROR;
@@ -528,7 +543,7 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
          parent = mds_fid2dentry(mds, rec->ur_fid1, NULL);
          if (IS_ERR(parent)) {
                  rc = PTR_ERR(parent);
-                LCONSOLE_WARN("Parent "LPU64"/%u lookup error %d." 
+                LCONSOLE_WARN("Parent "LPU64"/%u lookup error %d."
                                " Evicting client %s with export %s.\n",
                                rec->ur_fid1->id, rec->ur_fid1->generation, rc,
                                obd_uuid2str(&exp->exp_client_uuid),
@@ -538,15 +553,16 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
                  return;
          }
  
-        dchild = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1);
+        dchild = mds_lookup(obd, rec->ur_name, parent, rec->ur_namelen - 1);
          if (IS_ERR(dchild)) {
                  rc = PTR_ERR(dchild);
-                LCONSOLE_WARN("Child "LPU64"/%u lookup error %d." 
+                LCONSOLE_WARN("Child "LPU64"/%u lookup error %d."
                                " Evicting client %s with export %s.\n",
                                rec->ur_fid1->id, rec->ur_fid1->generation, rc,
                                obd_uuid2str(&exp->exp_client_uuid),
                                obd_export_nid2str(exp));
                  mds_export_evict(exp);
+                l_dput(parent);
                  EXIT;
                  return;
          }
@@ -563,11 +579,11 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
                  GOTO(out_dput, 0);
          }
  
-        mds_pack_inode2fid(&body->fid1, dchild->d_inode);
          mds_pack_inode2body(body, dchild->d_inode);
          if (S_ISREG(dchild->d_inode->i_mode)) {
                  rc = mds_pack_md(obd, req->rq_repmsg, DLM_REPLY_REC_OFF + 1,
-                                 body, dchild->d_inode, 1, 0);
+                                 body, dchild->d_inode, 1, 0,
+                                 req->rq_export->exp_connect_flags);
  
                  if (rc)
                          LASSERT(rc == req->rq_status);
@@ -632,9 +648,16 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
           * to detect a re-open */
          if (mfd == NULL) {
                  if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+#if LUSTRE_FIX >= 50
+                        /* Allow file join in beta builds to allow debugging */
                          rc = mds_join_file(rec, req, dchild, NULL);
                          if (rc)
                                  GOTO(out_dput, rc);
+#else
+                        CWARN("file join is not supported in this version of "
+                              "Lustre\n");
+                        GOTO(out_dput, req->rq_status = rc = -EOPNOTSUPP);
+#endif
                  }
                  mntget(mds->mds_vfsmnt);
                  CERROR("Re-opened file \n");
@@ -664,6 +687,52 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
          EXIT;
  }
  
+/* if client disconnects during recovery it may resend opens which were replayed
+ * on server but their transno less then last_transno on server so they will not
+ * be detected as reconstructs */
+static int open_replay_reconstruct(struct ptlrpc_request *req)
+{
+        struct mds_export_data *med = &req->rq_export->exp_mds_data;
+        struct mds_file_data *mfd = NULL;
+        struct list_head *t;
+
+        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
+                return 0;
+
+        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))
+                return 0;
+
+        /* if mfd exists then replay was done already */
+        spin_lock(&med->med_open_lock);
+        list_for_each(t, &med->med_open_head) {
+                mfd = list_entry(t, struct mds_file_data, mfd_list);
+                if (mfd->mfd_xid == req->rq_xid) {
+                        mds_mfd_addref(mfd);
+                        break;
+                }
+                mfd = NULL;
+        }
+        spin_unlock(&med->med_open_lock);
+
+        if (mfd) {
+                struct mds_body *body = lustre_msg_buf(req->rq_repmsg,
+                                                       DLM_REPLY_REC_OFF,
+                                                       sizeof(*body));
+                __u64 *pre_versions = lustre_msg_get_versions(req->rq_reqmsg);
+
+                body->handle.cookie = mfd->mfd_handle.h_cookie;
+                CDEBUG(D_INODE, "resend mfd %p, cookie "LPX64"\n", mfd,
+                       mfd->mfd_handle.h_cookie);
+                mds_mfd_put(mfd);
+                lustre_msg_set_versions(req->rq_repmsg, pre_versions);
+                lustre_msg_set_transno(req->rq_repmsg,
+                                       lustre_msg_get_transno(req->rq_reqmsg));
+                lustre_msg_set_status(req->rq_repmsg, 0);
+                return 1;
+        }
+        return 0;
+}
+
  /* do NOT or the MAY_*'s, you'll get the weakest */
  static int accmode(struct inode *inode, int flags)
  {
@@ -705,7 +774,8 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
          if (S_ISREG(dchild->d_inode->i_mode) &&
              !(body->valid & OBD_MD_FLEASIZE)) {
                  rc = mds_pack_md(obd, req->rq_repmsg, DLM_REPLY_REC_OFF + 1,
-                                 body, dchild->d_inode, 0, 0);
+                                 body, dchild->d_inode, 0, 0,
+                                 req->rq_export->exp_connect_flags);
                  if (rc) {
                          UNLOCK_INODE_MUTEX(dchild->d_inode);
                          RETURN(rc);
@@ -718,11 +788,18 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
                          RETURN(-EEXIST);
                  }
                  if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+#if LUSTRE_FIX >= 50
+                        /* Allow file join in beta builds to allow debugging */
                          UNLOCK_INODE_MUTEX(dchild->d_inode);
                          rc = mds_join_file(rec, req, dchild, lockh);
                          if (rc)
                                  RETURN(rc);
                          LOCK_INODE_MUTEX(dchild->d_inode);
+#else
+                        CWARN("file join is not supported in this version of "
+                              "Lustre\n");
+                        RETURN(-EOPNOTSUPP);
+#endif
                  }
                  if (!(body->valid & OBD_MD_FLEASIZE) &&
                      !(body->valid & OBD_MD_FLMODEASIZE)) {
@@ -760,6 +837,9 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
                          RETURN(rc);
          }
  
+        if ((rc = mds_lov_prepare_objids(obd,lmm)) != 0)
+                RETURN(rc);
+
          intent_set_disposition(rep, DISP_OPEN_OPEN);
          mfd = mds_dentry_open(dchild, mds->mds_vfsmnt, flags, req);
          if (IS_ERR(mfd))
@@ -770,7 +850,7 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
  
          mds_lov_update_objids(obd, lmm);
  
-        if (rc) /* coverity[deadcode] */
+        if (rc)
                  mds_mfd_unlink(mfd, 1);
  
          mds_mfd_put(mfd);
@@ -781,6 +861,7 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid,
                             struct mds_body *body, int flags,
                             struct mds_update_record *rec,struct ldlm_reply *rep)
  {
+        struct obd_device *obd = req->rq_export->exp_obd;
          struct mds_obd *mds = mds_req2mds(req);
          struct dentry *dchild;
          char fidname[LL_FID_NAMELEN];
@@ -789,7 +870,7 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid,
          ENTRY;
  
          fidlen = ll_fid2str(fidname, fid->id, fid->generation);
-        dchild = ll_lookup_one_len(fidname, mds->mds_pending_dir, fidlen);
+        dchild = mds_lookup(obd, fidname, mds->mds_pending_dir, fidlen);
          if (IS_ERR(dchild)) {
                  rc = PTR_ERR(dchild);
                  CERROR("error looking up %s in PENDING: rc = %d\n",fidname, rc);
@@ -810,13 +891,12 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid,
                          RETURN(PTR_ERR(dchild));
          }
  
-        mds_pack_inode2fid(&body->fid1, dchild->d_inode);
          mds_pack_inode2body(body, dchild->d_inode);
          intent_set_disposition(rep, DISP_LOOKUP_EXECD);
          intent_set_disposition(rep, DISP_LOOKUP_POS);
  
          rc = mds_finish_open(req, dchild, body, flags, &handle, rec, rep, NULL);
-        rc = mds_finish_transno(mds, dchild->d_inode, handle,
+        rc = mds_finish_transno(mds, NULL, handle,
                                  req, rc, rep ? rep->lock_policy_res1 : 0, 0);
          /* XXX what do we do here if mds_finish_transno itself failed? */
  
@@ -885,6 +965,7 @@ int mds_open(struct mds_update_record *rec, int offset,
          struct ldlm_reply *rep = NULL;
          struct mds_body *body = NULL;
          struct dentry *dchild = NULL, *dparent = NULL;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct mds_export_data *med;
          struct lustre_handle parent_lockh;
          int rc = 0, cleanup_phase = 0, acc_mode, created = 0;
@@ -896,10 +977,9 @@ int mds_open(struct mds_update_record *rec, int offset,
          int child_mode = LCK_CR;
          /* Always returning LOOKUP lock if open succesful to guard
             dentry on client. */
-        ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_LOOKUP}};
-        struct ldlm_res_id child_res_id = { .name = {0}};
          int lock_flags = 0;
          int rec_pending = 0;
+        int use_parent, need_open_lock;
          unsigned int gid = current->fsgid;
          ENTRY;
  
@@ -923,6 +1003,10 @@ int mds_open(struct mds_update_record *rec, int offset,
                  LBUG();
          }
  
+        /* check the open resent|replay case */
+        if (open_replay_reconstruct(req))
+                RETURN(0);
+
          MDS_CHECK_RESENT(req, reconstruct_open(rec, offset, req, child_lockh));
  
          /* Step 0: If we are passed a fid, then we assume the client already
@@ -942,6 +1026,13 @@ int mds_open(struct mds_update_record *rec, int offset,
                          RETURN(-EFAULT);
                  }
  
+                /** check there is no stale orphan with same inode number */
+                if (rec->ur_flags & MDS_OPEN_CREAT) {
+                        rc = mds_check_stale_orphan(obd, rec->ur_fid2);
+                        if (rc)
+                                RETURN(rc);
+                }
+
                  rc = mds_open_by_fid(req, rec->ur_fid2, body, rec->ur_flags,
                                       rec, rep);
                  if (rc != -ENOENT) {
@@ -973,17 +1064,45 @@ int mds_open(struct mds_update_record *rec, int offset,
                  RETURN(-ENOMEM);
          }
  
-        /* Step 1: Find and lock the parent */
          if (rec->ur_flags & (MDS_OPEN_CREAT | MDS_OPEN_JOIN_FILE))
                  parent_mode = LCK_EX;
-        dparent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, parent_mode,
-                                        &parent_lockh, rec->ur_name,
-                                        rec->ur_namelen - 1,
-                                        MDS_INODELOCK_UPDATE);
-        if (IS_ERR(dparent)) {
-                rc = PTR_ERR(dparent);
+
+        /* We cannot use acc_mode here, because it is zeroed in case of
+           creating a file, so we get wrong lockmode */
+        if (rec->ur_flags & FMODE_WRITE)
+                child_mode = LCK_CW;
+        else if (rec->ur_flags & MDS_FMODE_EXEC)
+                child_mode = LCK_PR;
+        else
+                child_mode = LCK_CR;
+
+        /* join file and nfsd can't need lookup dchild as use parent for it */
+        use_parent = (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) &&
+                     (rec->ur_flags & MDS_OPEN_LOCK) && (rec->ur_namelen == 1)) ||
+                     (rec->ur_flags & MDS_OPEN_JOIN_FILE);
+
+        need_open_lock = !(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) &&
+                           (rec->ur_flags & MDS_OPEN_LOCK);
+
+        /* Try to lock both parent and child first. If child is not found,
+         * return only locked parent.  This is enough to prevent other
+         * threads from changing this directory until creation is finished. */
+        rc = mds_get_parent_child_locked(obd, &obd->u.mds,
+                                         rec->ur_fid1,
+                                         &parent_lockh,
+                                         &dparent, parent_mode,
+                                         MDS_INODELOCK_UPDATE,
+                                         use_parent ? NULL : rec->ur_name,
+                                         rec->ur_namelen,
+                                         (rec->ur_flags & MDS_OPEN_LOCK) ?
+                                                child_lockh : NULL,
+                                         &dchild, child_mode,
+                                         MDS_INODELOCK_LOOKUP |
+                                         MDS_INODELOCK_OPEN);
+
+        if (rc) {
                  if (rc != -ENOENT) {
-                        CERROR("parent "LPU64"/%u lookup error %d\n",
+                        CERROR("parent "LPU64"/%u lookup/take lock error %d\n",
                                 rec->ur_fid1->id, rec->ur_fid1->generation, rc);
                  } else {
                          /* Just cannot find parent - make it look like
@@ -997,34 +1116,20 @@ int mds_open(struct mds_update_record *rec, int offset,
  
          cleanup_phase = 1; /* parent dentry and lock */
  
-        if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+        if (use_parent)
                  dchild = dget(dparent);
-                cleanup_phase = 2; /* child dentry */
-                acc_mode = accmode(dchild->d_inode, rec->ur_flags);
-                GOTO(found_child, rc);
-        }
  
-        /* Step 2: Lookup the child */
-      
-        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) &&
-            (rec->ur_flags & MDS_OPEN_LOCK) && (rec->ur_namelen == 1)) {
-                /* hack for nfsd with no_subtree_check, it will use anon
-                 * dentry w/o filename to open the file. the anon dentry's
-                 * parent was set to itself, so rec->ur_fid1 is the file.
-                 * And in MDC it cannot derive the dentry's parent dentry,
-                 * hence the file's name, so we hack here in MDS, 
-                 * refer to bug 13030. */
-                dchild = mds_fid2dentry(mds, rec->ur_fid1, NULL);
-        } else {
-                dchild = ll_lookup_one_len(rec->ur_name, dparent,
-                                           rec->ur_namelen - 1);
-        }
          if (IS_ERR(dchild)) {
                  rc = PTR_ERR(dchild);
                  dchild = NULL; /* don't confuse mds_finish_transno() below */
                  GOTO(cleanup, rc);
          }
  
+        if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+                acc_mode = accmode(dchild->d_inode, rec->ur_flags);
+                GOTO(found_child, rc);
+        }
+
          cleanup_phase = 2; /* child dentry */
  
          intent_set_disposition(rep, DISP_LOOKUP_EXECD);
@@ -1047,15 +1152,28 @@ int mds_open(struct mds_update_record *rec, int offset,
                  if (req->rq_export->exp_connect_flags & OBD_CONNECT_RDONLY)
                          GOTO(cleanup, rc = -EROFS);
  
+                /** check there is no stale orphan with same inode number */
+                rc = mds_check_stale_orphan(obd, rec->ur_fid2);
+                if (rc)
+                        GOTO(cleanup, rc);
+
+                /* version recovery check */
+                rc = mds_version_get_check(req, dparent->d_inode, 0);
+                if (rc)
+                        GOTO(cleanup_no_trans, rc);
+
                  if (dparent->d_inode->i_mode & S_ISGID)
                          gid = dparent->d_inode->i_gid;
                  else
                          gid = current->fsgid;
  
                  /* we try to get enough quota to write here, and let ldiskfs
-                 * decide if it is out of quota or not b=14783 */
+                 * decide if it is out of quota or not b=14783
+                 * FIXME: after CMD is used, pointer to obd_trans_info* couldn't
+                 * be NULL, b=14840 */
                  lquota_chkquota(mds_quota_interface_ref, obd,
-                                current->fsuid, gid, 1, &rec_pending);
+                                current->fsuid, gid, 1, &rec_pending,
+                                NULL, NULL, 0);
  
                  intent_set_disposition(rep, DISP_OPEN_CREATE);
                  handle = fsfilt_start(obd, dparent->d_inode, FSFILT_OP_CREATE,
@@ -1069,7 +1187,9 @@ int mds_open(struct mds_update_record *rec, int offset,
                  dp.ldp_ptr = req;
                  dp.ldp_inum = ino;
  
+                LOCK_INODE_MUTEX(dparent->d_inode);
                  rc = ll_vfs_create(dparent->d_inode, dchild, rec->ur_mode,NULL);
+                UNLOCK_INODE_MUTEX(dparent->d_inode);
                  if (dchild->d_fsdata == (void *)(unsigned long)ino)
                          dchild->d_fsdata = NULL;
  
@@ -1078,15 +1198,24 @@ int mds_open(struct mds_update_record *rec, int offset,
                          GOTO(cleanup, rc);
                  }
                  inode = dchild->d_inode;
+                created = 1;
                  if (ino) {
-                        LASSERT(ino == inode->i_ino);
+                        if (ino != inode->i_ino) {
+                                /* FID support is needed to replay this
+                                 * correctly. Now fail gracefully like there is
+                                 * version mismatch */
+                                if (req->rq_export->exp_delayed)
+                                        rc = -EOVERFLOW;
+                                else
+                                        rc = -EFAULT;
+                                GOTO(cleanup, rc);
+                        }
                          /* Written as part of setattr */
                          inode->i_generation = rec->ur_fid2->generation;
                          CDEBUG(D_HA, "recreated ino %lu with gen %u\n",
                                 inode->i_ino, inode->i_generation);
                  }
  
-                created = 1;
                  LTIME_S(iattr.ia_atime) = rec->ur_time;
                  LTIME_S(iattr.ia_ctime) = rec->ur_time;
                  LTIME_S(iattr.ia_mtime) = rec->ur_time;
@@ -1112,6 +1241,14 @@ int mds_open(struct mds_update_record *rec, int offset,
                  acc_mode = 0;           /* Don't check for permissions */
          } else {
                  acc_mode = accmode(dchild->d_inode, rec->ur_flags);
+                /* Child previously existed so the lookup and lock is already
+                 * done, so no further locking is needed. */
+                /* for nfs and join - we need two locks for same fid, but
+                 * with different mode */
+                if (need_open_lock && !use_parent)  {
+                        intent_set_disposition(rep, DISP_OPEN_LOCK);
+                        need_open_lock = 0;
+                }
          }
  
          LASSERTF(!mds_inode_is_orphan(dchild->d_inode),
@@ -1120,7 +1257,6 @@ int mds_open(struct mds_update_record *rec, int offset,
                   dchild->d_inode->i_ino, dchild->d_inode->i_generation);
  
  found_child:
-        mds_pack_inode2fid(&body->fid1, dchild->d_inode);
          mds_pack_inode2body(body, dchild->d_inode);
  
          if (S_ISREG(dchild->d_inode->i_mode)) {
@@ -1175,28 +1311,18 @@ found_child:
                  GOTO(cleanup, rc = -EAGAIN);
          }
  
-        /* Obtain OPEN lock as well */
-        policy.l_inodebits.bits |= MDS_INODELOCK_OPEN;
-
-        /* We cannot use acc_mode here, because it is zeroed in case of
-           creating a file, so we get wrong lockmode */
-        if (rec->ur_flags & FMODE_WRITE)
-                child_mode = LCK_CW;
-        else if (rec->ur_flags & MDS_FMODE_EXEC)
-                child_mode = LCK_PR;
-        else
-                child_mode = LCK_CR;
+        if (need_open_lock) {
+                ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP | MDS_INODELOCK_OPEN } };
+                struct ldlm_res_id child_res_id;
  
-        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) && 
-             (rec->ur_flags & MDS_OPEN_LOCK)) {
                  /* In case of replay we do not get a lock assuming that the
                     caller has it already */
                  child_res_id.name[0] = dchild->d_inode->i_ino;
                  child_res_id.name[1] = dchild->d_inode->i_generation;
  
                  rc = ldlm_cli_enqueue_local(obd->obd_namespace, &child_res_id,
-                                            LDLM_IBITS, &policy, child_mode, 
-                                            &lock_flags, ldlm_blocking_ast, 
+                                            LDLM_IBITS, &policy, child_mode,
+                                            &lock_flags, ldlm_blocking_ast,
                                              ldlm_completion_ast, NULL, NULL,
                                              0, NULL, child_lockh);
                  if (rc != ELDLM_OK)
@@ -1204,7 +1330,6 @@ found_child:
  
                  /* Let mds_intent_policy know that we have a lock to return */
                  intent_set_disposition(rep, DISP_OPEN_LOCK);
-                cleanup_phase = 3;
          }
  
          if (!S_ISREG(dchild->d_inode->i_mode) &&
@@ -1221,22 +1346,23 @@ found_child:
          GOTO(cleanup, rc);
  
   cleanup:
-        rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, handle,
-                                req, rc, rep ? rep->lock_policy_res1 : 0, 0);
+        inodes[0] = (!created || IS_ERR(dparent)) ? NULL : dparent->d_inode;
+        inodes[1] = (created && dchild) ? dchild->d_inode : NULL;
+        rc = mds_finish_transno(mds, inodes, handle, req, rc,
+                                rep ? rep->lock_policy_res1 : 0, 0);
  
   cleanup_no_trans:
          if (rec_pending)
                  lquota_pending_commit(mds_quota_interface_ref, obd,
-                                      current->fsuid, gid, 1);
+                                      current->fsuid, gid, rec_pending);
          switch (cleanup_phase) {
-        case 3:
-                if (rc)
-                        /* It is safe to leave IT_OPEN_LOCK set, if rc is not 0,
-                         * mds_intent_policy won't try to return any locks */
-                        ldlm_lock_decref(child_lockh, child_mode);
          case 2:
                  if (rc && created) {
-                        int err = vfs_unlink(dparent->d_inode, dchild);
+                        int err;
+                        LOCK_INODE_MUTEX(dparent->d_inode);
+                        err = ll_vfs_unlink(dparent->d_inode, dchild,
+                                            mds->mds_vfsmnt);
+                        UNLOCK_INODE_MUTEX(dparent->d_inode);
                          if (err) {
                                  CERROR("unlink(%.*s) in error path: %d\n",
                                         dchild->d_name.len, dchild->d_name.name,
@@ -1248,8 +1374,14 @@ found_child:
                          qpids[USRQUOTA] = dparent->d_inode->i_uid;
                          qpids[GRPQUOTA] = dparent->d_inode->i_gid;
                  }
-                l_dput(dchild);
          case 1:
+                if (dchild) {
+                        l_dput(dchild);
+                        /* It is safe to leave IT_OPEN_LOCK set, if rc is not 0,
+                         * mds_intent_policy won't try to return any locks */
+                        if (rc && child_lockh->cookie)
+                                ldlm_lock_decref(child_lockh, child_mode);
+                }
                  if (dparent == NULL)
                          break;
  
@@ -1262,6 +1394,7 @@ found_child:
          /* trigger dqacq on the owner of child and parent */
          lquota_adjust(mds_quota_interface_ref, obd, qcids, qpids, rc,
                        FSFILT_OP_CREATE);
+
          RETURN(rc);
  }
  
@@ -1304,8 +1437,9 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,
          CDEBUG(D_INODE, "inode %p ino %s nlink %d orphan %d\n", inode, fidname,
                 inode->i_nlink, mds_orphan_open_count(inode));
  
-        last_orphan = mds_orphan_open_dec_test(inode) &&
-                      mds_inode_is_orphan(inode);
+        last_orphan = (mds_orphan_open_dec_test(inode) &&
+                       mds_inode_is_orphan(inode) &&
+                       !obd->obd_recovering);
  
          /* this is half of the actual "close" */
          if (mfd->mfd_mode & FMODE_WRITE) {
@@ -1314,7 +1448,7 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,
          } else if (mfd->mfd_mode & MDS_FMODE_EXEC) {
                  mds_allow_write_access(inode);
          }
-        /* here writecount change also needs protection from orphan write sem. 
+        /* here writecount change also needs protection from orphan write sem.
           * so drop orphan write sem after mds_put_write_access, bz 12888. */
          MDS_UP_WRITE_ORPHAN_SEM(inode);
  
@@ -1345,7 +1479,8 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,
  
                  cleanup_phase = 2; /* dput(pending_child) when finished */
                  if (S_ISDIR(pending_child->d_inode->i_mode)) {
-                        rc = vfs_rmdir(pending_dir, pending_child);
+                        rc = ll_vfs_rmdir(pending_dir, pending_child,
+                                          mds->mds_vfsmnt);
                          if (rc)
                                  CERROR("error unlinking orphan dir %s: rc %d\n",
                                         fidname,rc);
@@ -1372,52 +1507,36 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,
                  dp.ldp_inum = 0;
                  dp.ldp_ptr = req;
                  pending_child->d_fsdata = (void *) &dp;
-                rc = vfs_unlink(pending_dir, pending_child);
+                rc = ll_vfs_unlink(pending_dir, pending_child, mds->mds_vfsmnt);
                  if (rc)
                          CERROR("error unlinking orphan %s: rc %d\n",fidname,rc);
  
                  goto out; /* Don't bother updating attrs on unlinked inode */
          }
  
-#if 0
-        if (request_body != NULL && mfd->mfd_mode & FMODE_WRITE && rc == 0) {
-                /* Update the on-disk attributes if this was the last write
-                 * close, and all information was provided (i.e., rc == 0)
-                 *
-                 * XXX this should probably be abstracted with mds_reint_setattr
-                 */
-
-                if (request_body->valid & OBD_MD_FLMTIME &&
-                    LTIME_S(iattr.ia_mtime) > LTIME_S(inode->i_mtime)) {
-                        LTIME_S(iattr.ia_mtime) = request_body->mtime;
-                        iattr.ia_valid |= ATTR_MTIME;
-                }
-                if (request_body->valid & OBD_MD_FLCTIME &&
-                    LTIME_S(iattr.ia_ctime) > LTIME_S(inode->i_ctime)) {
-                        LTIME_S(iattr.ia_ctime) = request_body->ctime;
-                        iattr.ia_valid |= ATTR_CTIME;
+        if (request_body != NULL) {
+                /* Only start a transaction to write out only the atime if it
+                 * is more out-of-date than the specified limit.  If we are
+                 * already going to write out the atime then do it anyway. */
+                if ((request_body->valid & OBD_MD_FLATIME) &&
+                    ((LTIME_S(iattr.ia_atime) >
+                      LTIME_S(inode->i_atime) + mds->mds_atime_diff) ||
+                     (iattr.ia_valid != 0 &&
+                      LTIME_S(iattr.ia_atime) > LTIME_S(inode->i_atime)))) {
+                        LTIME_S(iattr.ia_atime) = request_body->atime;
+                        iattr.ia_valid |= ATTR_ATIME;
                  }
  
-                /* XXX can't set block count with fsfilt_setattr (!) */
-                if (request_body->valid & OBD_MD_FLSIZE) {
-                        iattr.ia_valid |= ATTR_SIZE;
+                /* Store a rough estimate of the file size on the MDS for
+                 * tools like e2scan and HSM that are just using this for *
+                 * rough decision making and will get the proper size later.
+                 * * This is NOT guaranteed to be correct with multiple *
+                 * writers, but is only needed until SOM is done. b=11063 */
+                if ((request_body->valid & OBD_MD_FLSIZE) &&
+                    (iattr.ia_valid != 0)) {
                          iattr.ia_size = request_body->size;
+                        iattr.ia_valid |= ATTR_SIZE;
                  }
-                /* iattr.ia_blocks = request_body->blocks */
-
-        }
-#endif
-        if (request_body != NULL && request_body->valid & OBD_MD_FLATIME) {
-                /* Only start a transaction to write out only the atime if
-                 * it is more out-of-date than the specified limit.  If we
-                 * are already going to write out the atime then do it anyway.
-                 * */
-                LTIME_S(iattr.ia_atime) = request_body->atime;
-                if ((LTIME_S(iattr.ia_atime) >
-                     LTIME_S(inode->i_atime) + mds->mds_atime_diff) ||
-                    (iattr.ia_valid != 0 &&
-                     LTIME_S(iattr.ia_atime) > LTIME_S(inode->i_atime)))
-                        iattr.ia_valid |= ATTR_ATIME;
          }
  
          if (iattr.ia_valid != 0) {
@@ -1440,7 +1559,7 @@ out:
  
   cleanup:
          if (req != NULL && reply_body != NULL) {
-                rc = mds_finish_transno(mds, pending_dir, handle, req, rc, 0, 0);
+                rc = mds_finish_transno(mds, NULL, handle, req, rc, 0, 0);
          } else if (handle) {
                  int err = fsfilt_commit(obd, pending_dir, handle, 0);
                  if (err) {
@@ -1518,15 +1637,16 @@ int mds_close(struct ptlrpc_request *req, int offset)
          inode = mfd->mfd_dentry->d_inode;
          /* child orphan sem protects orphan_dec_test && is_orphan race */
          MDS_DOWN_WRITE_ORPHAN_SEM(inode); /* mds_mfd_close drops this */
-        if (mds_inode_is_orphan(inode) && mds_orphan_open_count(inode) == 1) {
+        if (!obd->obd_recovering &&
+            mds_inode_is_orphan(inode) && mds_orphan_open_count(inode) == 1) {
                  body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
                                        sizeof(*body));
                  LASSERT(body != NULL);
  
-                mds_pack_inode2fid(&body->fid1, inode);
                  mds_pack_inode2body(body, inode);
                  mds_pack_md(obd, req->rq_repmsg, REPLY_REC_OFF + 1, body, inode,
-                            MDS_PACK_MD_LOCK, 0);
+                            MDS_PACK_MD_LOCK, 0,
+                            req->rq_export->exp_connect_flags);
          }
  
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c

index e12d6a0..b6ce9f7 100644 (file)
--- a/lustre/mds/mds_reint.c
+++ b/lustre/mds/mds_reint.c
@@ -1,31 +1,45 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/mds/mds_reint.c
- *  Lustre Metadata Server (mds) reintegration routines
+ * GPL HEADER START
   *
- *  Copyright (C) 2002-2005 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_reint.c
+ *
+ * Lustre Metadata Server (mds) reintegration routines
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -43,13 +57,15 @@
  #include <lustre_dlm.h>
  #include <lustre_fsfilt.h>
  #include <lustre_ucache.h>
+#include <lustre_net.h>
  
  #include "mds_internal.h"
  
  void mds_commit_cb(struct obd_device *obd, __u64 transno, void *data,
                     int error)
  {
-        obd_transno_commit_cb(obd, transno, error);
+        struct obd_export *exp = data;
+        obd_transno_commit_cb(obd, transno, exp, error);
  }
  
  struct mds_logcancel_data {
@@ -60,6 +76,38 @@ struct mds_logcancel_data {
          struct llog_cookie      mlcd_cookies[0];
  };
  
+/** lookup child dentry in parent dentry according to the name.
+ *  if dentry is found, delete "lustre_mdt_attrs" EA (with name "lma")
+ *  if it exists by checking OBD_INCOMPAT_FID.
+ */
+struct dentry *mds_lookup(struct obd_device *obd, const char *fid_name,
+                          struct dentry *dparent, int fid_namelen)
+{
+        struct dentry *dchild;
+        struct lr_server_data *lsd = obd->u.mds.mds_server_data;
+        EXIT;
+
+        dchild = ll_lookup_one_len(fid_name, dparent, fid_namelen);
+        if (!IS_ERR(dchild) &&
+            unlikely((lsd->lsd_feature_incompat & OBD_INCOMPAT_FID) ||
+                      OBD_FAIL_CHECK(OBD_FAIL_MDS_REMOVE_COMMON_EA))) {
+                struct inode *inode = dchild->d_inode; 
+                void         *handle;
+                if (inode != NULL) {
+                        handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR,
+                                              NULL);
+                        if (!IS_ERR(handle)) {
+                                LOCK_INODE_MUTEX(inode);
+                                fsfilt_set_md(obd, inode, handle, NULL, 0,
+                                              "lma");
+                                /* result is ignored. */
+                                UNLOCK_INODE_MUTEX(inode);
+                                fsfilt_commit(obd, inode, handle, 0);
+                        }
+                }
+        }
+        RETURN(dchild);
+}
  
  static void mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno,
                                    void *cb_data, int error)
@@ -69,7 +117,7 @@ static void mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno,
          struct llog_ctxt *ctxt;
          int rc;
  
-        obd_transno_commit_cb(obd, transno, error);
+        obd_transno_commit_cb(obd, transno, NULL, error);
  
          CDEBUG(D_RPCTRACE, "cancelling %d cookies\n",
                 (int)(mlcd->mlcd_cookielen / sizeof(*mlcd->mlcd_cookies)));
@@ -81,13 +129,13 @@ static void mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno,
                         (int)(mlcd->mlcd_cookielen/sizeof(*mlcd->mlcd_cookies)),
                         rc);
          } else {
-                ///* XXX 0 normally, SENDNOW for debug */);
                  rc = obd_checkmd(obd->u.mds.mds_osc_exp, obd->obd_self_export,
                                   lsm);
                  if (rc)
                          CERROR("Can not revalidate lsm %p \n", lsm);
  
                  ctxt = llog_get_context(obd,mlcd->mlcd_cookies[0].lgc_subsys+1);
+                /* XXX 0 normally, SENDNOW for debug */
                  rc = llog_cancel(ctxt, lsm, mlcd->mlcd_cookielen /
                                                  sizeof(*mlcd->mlcd_cookies),
                                   mlcd->mlcd_cookies, OBD_LLOG_FL_SENDNOW);
@@ -102,9 +150,65 @@ static void mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno,
          OBD_FREE(mlcd, mlcd->mlcd_size);
  }
  
+/* fsfilt_set_version return old version. use that here */
+static void mds_versions_set(struct obd_device *obd,
+                             struct inode **inodes, __u64 version)
+{
+        int i;
+
+        if (inodes == NULL)
+                return;
+
+        for (i = 0; i < PTLRPC_NUM_VERSIONS; i++)
+                if (inodes[i] != NULL)
+                        fsfilt_set_version(obd, inodes[i], version);
+}
+
+int mds_version_get_check(struct ptlrpc_request *req, struct inode *inode,
+                          int index)
+{
+        /* version recovery */
+        struct obd_device *obd = req->rq_export->exp_obd;
+        __u64 curr_version, *pre_versions;
+        ENTRY;
+
+        if (inode == NULL || !exp_connect_vbr(req->rq_export))
+                RETURN(0);
+
+        curr_version = fsfilt_get_version(obd, inode);
+        if ((__s64)curr_version == -EOPNOTSUPP)
+                RETURN(0);
+        /* VBR: version is checked always because costs nothing */
+        if (lustre_msg_get_transno(req->rq_reqmsg) != 0) {
+                pre_versions = lustre_msg_get_versions(req->rq_reqmsg);
+                LASSERT(index < PTLRPC_NUM_VERSIONS);
+                /* Sanity check for malformed buffers */
+                if (pre_versions == NULL) {
+                        CERROR("No versions in request buffer\n");
+                        spin_lock(&req->rq_export->exp_lock);
+                        req->rq_export->exp_vbr_failed = 1;
+                        spin_unlock(&req->rq_export->exp_lock);
+                        RETURN(-EOVERFLOW);
+                } else if (pre_versions[index] != curr_version) {
+                        CDEBUG(D_INODE, "Version mismatch "LPX64" != "LPX64"\n",
+                               pre_versions[index], curr_version);
+                        spin_lock(&req->rq_export->exp_lock);
+                        req->rq_export->exp_vbr_failed = 1;
+                        spin_unlock(&req->rq_export->exp_lock);
+                        RETURN(-EOVERFLOW);
+                }
+        }
+        /* save pre-versions in reply */
+        LASSERT(req->rq_repmsg != NULL);
+        pre_versions = lustre_msg_get_versions(req->rq_repmsg);
+        if (pre_versions)
+                pre_versions[index] = curr_version;
+        RETURN(0);
+}
+
  /* Assumes caller has already pushed us into the kernel context. */
-int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
-                       struct ptlrpc_request *req, int rc, __u32 op_data, 
+int mds_finish_transno(struct mds_obd *mds, struct inode **inodes, void *handle,
+                       struct ptlrpc_request *req, int rc, __u32 op_data,
                         int force_sync)
  {
          struct mds_export_data *med = &req->rq_export->exp_mds_data;
@@ -114,6 +218,8 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
          int err;
          loff_t off;
          int log_pri = D_RPCTRACE;
+        struct inode *inode = inodes ? inodes[0] : NULL;
+        int version_set = handle ? 1 : 0;
          ENTRY;
  
          if (IS_ERR(handle)) {
@@ -151,24 +257,33 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
                                 obd->obd_name,
                                 libcfs_nid2str(req->rq_export->exp_connection->c_peer.nid),
                                 transno, rc);
-                        transno = 0;
                  }
          } else if (transno == 0) {
                  spin_lock(&mds->mds_transno_lock);
                  transno = ++mds->mds_last_transno;
                  spin_unlock(&mds->mds_transno_lock);
+                /* VBR: set versions */
+                if (inodes && version_set)
+                        mds_versions_set(obd, inodes, transno);
          } else {
                  spin_lock(&mds->mds_transno_lock);
                  if (transno > mds->mds_last_transno)
                          mds->mds_last_transno = transno;
                  spin_unlock(&mds->mds_transno_lock);
+
+                /* VBR: replay case. Copy version from replay req and
+                 * set new versions */
+                mds_versions_set(obd, inodes, transno);
          }
  
          req->rq_transno = transno;
          lustre_msg_set_transno(req->rq_repmsg, transno);
+
+        if (transno == 0)
+                LASSERT(rc != 0);
          if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) {
-                prev_transno = le64_to_cpu(lcd->lcd_last_close_transno);
-                lcd->lcd_last_close_transno = cpu_to_le64(transno);
+                if (transno != 0)
+                        lcd->lcd_last_close_transno = cpu_to_le64(transno);
                  lcd->lcd_last_close_xid = cpu_to_le64(req->rq_xid);
                  lcd->lcd_last_close_result = cpu_to_le32(rc);
                  lcd->lcd_last_close_data = cpu_to_le32(op_data);
@@ -177,15 +292,23 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
                  if (((lustre_msg_get_flags(req->rq_reqmsg) &
                        (MSG_RESENT | MSG_REPLAY)) == 0) ||
                      (transno > prev_transno)) {
-                        lcd->lcd_last_transno = cpu_to_le64(transno);
+                        /* VBR: save versions in last_rcvd for reconstruct. */
+                        __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg);
+                        if (pre_versions) {
+                                lcd->lcd_pre_versions[0] = cpu_to_le64(pre_versions[0]);
+                                lcd->lcd_pre_versions[1] = cpu_to_le64(pre_versions[1]);
+                                lcd->lcd_pre_versions[2] = cpu_to_le64(pre_versions[2]);
+                                lcd->lcd_pre_versions[3] = cpu_to_le64(pre_versions[3]);
+                        }
+                        if (transno != 0)
+                                lcd->lcd_last_transno = cpu_to_le64(transno);
                          lcd->lcd_last_xid     = cpu_to_le64(req->rq_xid);
                          lcd->lcd_last_result  = cpu_to_le32(rc);
                          lcd->lcd_last_data    = cpu_to_le32(op_data);
                  }
          }
-        /* update the server data to not lose the greatest transno. Bug 11125 */
-        if ((transno == 0) && (prev_transno == mds->mds_last_transno))
-                mds_update_server_data(obd, 0);
+        /** update trans table */
+        target_trans_table_update(req->rq_export, transno);
  
          if (off <= 0) {
                  CERROR("client idx %d has offset %lld\n", med->med_lr_idx, off);
@@ -194,15 +317,15 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
                  struct obd_export *exp = req->rq_export;
  
                  if (!force_sync)
-                        force_sync = fsfilt_add_journal_cb(exp->exp_obd,transno, 
-                                                          handle, mds_commit_cb,
-                                                          NULL);
+                        force_sync = fsfilt_add_journal_cb(obd, transno,
+                                                           handle, mds_commit_cb,
+                                                           exp);
  
                  err = fsfilt_write_record(obd, mds->mds_rcvd_filp, lcd,
                                            sizeof(*lcd), &off,
                                            force_sync | exp->exp_need_sync);
                  if (force_sync)
-                        mds_commit_cb(obd, transno, NULL, err);
+                        mds_commit_cb(obd, transno, exp, err);
          }
  
          if (err) {
@@ -320,7 +443,15 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
                          attr->ia_valid |= ATTR_MODE;
                  }
          } else if (ia_valid & ATTR_MODE) {
-                int mode = attr->ia_mode;
+                int mode;
+                if (!(attr->ia_valid & ATTR_FORCE)) {
+                        mode = inode->i_mode;
+                        if (((mode & S_ISUID) && (!(attr->ia_mode & S_ISUID))) ||
+                            ((mode & S_ISGID) && (mode & S_IXGRP) &&
+                            (!(attr->ia_mode & S_ISGID))))
+                                attr->ia_valid |= ATTR_FORCE;
+                }
+                mode = attr->ia_mode;
                  /* chmod */
                  if (attr->ia_mode == (umode_t)-1)
                          mode = inode->i_mode;
@@ -380,19 +511,34 @@ void mds_steal_ack_locks(struct ptlrpc_request *req)
          spin_unlock(&exp->exp_lock);
  }
  
+/**
+ * VBR: restore versions
+ */
+void mds_vbr_reconstruct(struct ptlrpc_request *req,
+                         struct lsd_client_data *lcd)
+{
+        __u64 pre_versions[4] = {0};
+        pre_versions[0] = le64_to_cpu(lcd->lcd_pre_versions[0]);
+        pre_versions[1] = le64_to_cpu(lcd->lcd_pre_versions[1]);
+        pre_versions[2] = le64_to_cpu(lcd->lcd_pre_versions[2]);
+        pre_versions[3] = le64_to_cpu(lcd->lcd_pre_versions[3]);
+        lustre_msg_set_versions(req->rq_repmsg, pre_versions);
+}
+
  void mds_req_from_lcd(struct ptlrpc_request *req, struct lsd_client_data *lcd)
  {
          if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) {
                  req->rq_transno = le64_to_cpu(lcd->lcd_last_close_transno);
-                lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
                  req->rq_status = le32_to_cpu(lcd->lcd_last_close_result);
-                lustre_msg_set_status(req->rq_repmsg, req->rq_status);
          } else {
                  req->rq_transno = le64_to_cpu(lcd->lcd_last_transno);
-                lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
                  req->rq_status = le32_to_cpu(lcd->lcd_last_result);
-                lustre_msg_set_status(req->rq_repmsg, req->rq_status);
+                mds_vbr_reconstruct(req, lcd);
          }
+        if (req->rq_status != 0)
+                req->rq_transno = 0;
+        lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
+        lustre_msg_set_status(req->rq_repmsg, req->rq_status);
          DEBUG_REQ(D_RPCTRACE, req, "restoring transno "LPD64"/status %d",
                    req->rq_transno, req->rq_status);
  
@@ -425,7 +571,6 @@ static void reconstruct_reint_setattr(struct mds_update_record *rec,
          }
  
          body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body));
-        mds_pack_inode2fid(&body->fid1, de->d_inode);
          mds_pack_inode2body(body, de->d_inode);
  
          /* Don't return OST-specific attributes if we didn't just set them */
@@ -511,6 +656,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
          struct obd_device *obd = req->rq_export->exp_obd;
          struct mds_body *body;
          struct dentry *de;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct inode *inode = NULL;
          struct lustre_handle lockh;
          void *handle = NULL;
@@ -518,7 +664,10 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
          struct lov_mds_md *lmm = NULL;
          struct llog_cookie *logcookies = NULL;
          int lmm_size = 0, need_lock = 1, cookie_size = 0;
-        int rc = 0, cleanup_phase = 0, err, locked = 0, sync = 0;
+        int rc = 0, cleanup_phase = 0, err = 0, locked = 0, sync = 0;
+        int do_vbr = rec->ur_iattr.ia_valid &
+                     (ATTR_MODE|ATTR_UID|ATTR_GID|
+                      ATTR_FROM_OPEN|ATTR_RAW|ATTR_ATTR_FLAG);
          unsigned int qcids[MAXQUOTAS] = { 0, 0 };
          unsigned int qpids[MAXQUOTAS] = { rec->ur_iattr.ia_uid, 
                                            rec->ur_iattr.ia_gid };
@@ -578,6 +727,12 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
  
          OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_REINT_SETATTR_WRITE, inode->i_sb);
  
+        /* VBR: update version if attr changed are important for recovery */
+        if (do_vbr) {
+                rc = mds_version_get_check(req, inode, 0);
+                if (rc)
+                        GOTO(cleanup_no_trans, rc);
+        }
          /* start a log jounal handle if needed */
          if (S_ISREG(inode->i_mode) &&
              rec->ur_iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
@@ -587,7 +742,8 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                          GOTO(cleanup, rc = -ENOMEM);
  
                  cleanup_phase = 2;
-                rc = mds_get_md(obd, inode, lmm, &lmm_size, need_lock, 0);
+                rc = mds_get_md(obd, inode, lmm, &lmm_size, need_lock, 0,
+                                req->rq_export->exp_connect_flags);
                  if (rc < 0)
                          GOTO(cleanup, rc);
                  rc = 0;
@@ -638,16 +794,15 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                          GOTO(cleanup, rc);
  
                  lum = rec->ur_eadata;
-                /* if { size, offset, count } = { 0, -1, 0 } (i.e. all default
-                 * values specified) then delete default striping from dir. */
+                /* if { size, offset, count } = { 0, -1, 0 } and no pool
+                 * (i.e. all default values specified) then delete default
+                 * striping from dir. */
                  if (S_ISDIR(inode->i_mode) &&
-                    ((lum->lmm_stripe_size == 0 &&
+                    (lum->lmm_stripe_size == 0 &&
                        lum->lmm_stripe_offset ==
                        (typeof(lum->lmm_stripe_offset))(-1) &&
-                      lum->lmm_stripe_count == 0) ||
-                    /* lmm_stripe_size == -1 is deprecated in 1.4.6 */
-                    lum->lmm_stripe_size ==
-                    (typeof(lum->lmm_stripe_size))(-1))){
+                      lum->lmm_stripe_count == 0 &&
+                      le32_to_cpu(lum->lmm_magic) != LOV_USER_MAGIC_V3)){
                          rc = fsfilt_set_md(obd, inode, handle, NULL, 0, "lov");
                          if (rc)
                                  GOTO(cleanup, rc);
@@ -668,7 +823,6 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
          }
  
          body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body));
-        mds_pack_inode2fid(&body->fid1, inode);
          mds_pack_inode2body(body, inode);
  
          /* don't return OST-specific attributes if we didn't just set them. */
@@ -688,7 +842,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                          mlcd->mlcd_eadatalen = rec->ur_eadatalen;
                          mlcd->mlcd_cookielen = rec->ur_cookielen;
                          mlcd->mlcd_lmm = (void *)&mlcd->mlcd_cookies +
-                                mlcd->mlcd_cookielen;
+                                         mlcd->mlcd_cookielen;
                          memcpy(&mlcd->mlcd_cookies, rec->ur_logcookies,
                                 mlcd->mlcd_cookielen);
                          memcpy(mlcd->mlcd_lmm, rec->ur_eadata,
@@ -702,7 +856,15 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
          if (mlcd != NULL)
                  sync = fsfilt_add_journal_cb(req->rq_export->exp_obd, 0, handle,
                                               mds_cancel_cookies_cb, mlcd);
-        err = mds_finish_transno(mds, inode, handle, req, rc, 0, sync);
+
+        /* permission changes may require sync operation */
+        if (rc == 0 && ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+                sync |= mds->mds_sync_permission;
+        inodes[0] = inode;
+        err = mds_finish_transno(mds, do_vbr ? inodes : NULL, handle, req,
+                                 rc, 0, sync);
+
+ cleanup_no_trans:
          /* do mds to ost setattr if needed */
          if (!rc && !err && lmm_size)
                  mds_osc_setattr_async(obd, inode, lmm, lmm_size,
@@ -770,7 +932,8 @@ static void reconstruct_reint_create(struct mds_update_record *rec, int offset,
                  EXIT;
                  return;
          }
-        child = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1);
+        child = mds_lookup(exp->exp_obd, rec->ur_name, parent,
+                           rec->ur_namelen - 1);
          if (IS_ERR(child)) {
                  rc = PTR_ERR(child);
                  LCONSOLE_WARN("Child "LPU64"/%u lookup error %d." 
@@ -779,12 +942,12 @@ static void reconstruct_reint_create(struct mds_update_record *rec, int offset,
                                obd_uuid2str(&exp->exp_client_uuid),
                                obd_export_nid2str(exp));
                  mds_export_evict(exp);
+                l_dput(parent);
                  EXIT;
                  return;
          }
  
          body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body));
-        mds_pack_inode2fid(&body->fid1, child->d_inode);
          mds_pack_inode2body(body, child->d_inode);
  
          l_dput(parent);
@@ -799,10 +962,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
          struct mds_obd *mds = mds_req2mds(req);
          struct obd_device *obd = req->rq_export->exp_obd;
          struct dentry *dchild = NULL;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct inode *dir = NULL;
          void *handle = NULL;
          struct lustre_handle lockh;
-        int rc = 0, err, type = rec->ur_mode & S_IFMT, cleanup_phase = 0;
+        int rc = 0, err = 0, type = rec->ur_mode & S_IFMT, cleanup_phase = 0;
          int created = 0;
          unsigned int qcids[MAXQUOTAS] = { current->fsuid, current->fsgid };
          unsigned int qpids[MAXQUOTAS] = { 0, 0 };
@@ -844,9 +1008,10 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
  
          ldlm_lock_dump_handle(D_OTHER, &lockh);
  
-        dchild = ll_lookup_one_len(rec->ur_name, dparent, rec->ur_namelen - 1);
+        dchild = mds_lookup(obd, rec->ur_name, dparent, rec->ur_namelen - 1);
          if (IS_ERR(dchild)) {
                  rc = PTR_ERR(dchild);
+                dchild = NULL;
                  CDEBUG(D_DENTRY, "child lookup error %d\n", rc);
                  GOTO(cleanup, rc);
          }
@@ -861,6 +1026,16 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                  GOTO(cleanup, rc = -EROFS);
          }
  
+        /** check there is no stale orphan with same inode number */
+        rc = mds_check_stale_orphan(obd, rec->ur_fid2);
+        if (rc)
+                GOTO(cleanup, rc);
+
+        /* version recovery check */
+        rc = mds_version_get_check(req, dir, 0);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+
          if (dir->i_mode & S_ISGID && S_ISDIR(rec->ur_mode))
                  rec->ur_mode |= S_ISGID;
  
@@ -874,16 +1049,20 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                  gid = current->fsgid;
  
          /* we try to get enough quota to write here, and let ldiskfs
-         * decide if it is out of quota or not b=14783 */
+         * decide if it is out of quota or not b=14783
+         * FIXME: after CMD is used, pointer to obd_trans_info* couldn't
+         * be NULL, b=14840 */
          lquota_chkquota(mds_quota_interface_ref, obd,
-                        current->fsuid, gid, 1, &rec_pending);
+                        current->fsuid, gid, 1, &rec_pending, NULL, NULL, 0);
  
          switch (type) {
          case S_IFREG:{
                  handle = fsfilt_start(obd, dir, FSFILT_OP_CREATE, NULL);
                  if (IS_ERR(handle))
                          GOTO(cleanup, rc = PTR_ERR(handle));
+                LOCK_INODE_MUTEX(dir);
                  rc = ll_vfs_create(dir, dchild, rec->ur_mode, NULL);
+                UNLOCK_INODE_MUTEX(dir);
                  mds_counter_incr(req->rq_export, LPROC_MDS_MKNOD);
                  EXIT;
                  break;
@@ -892,7 +1071,9 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                  handle = fsfilt_start(obd, dir, FSFILT_OP_MKDIR, NULL);
                  if (IS_ERR(handle))
                          GOTO(cleanup, rc = PTR_ERR(handle));
-                rc = vfs_mkdir(dir, dchild, rec->ur_mode);
+                LOCK_INODE_MUTEX(dir);
+                rc = ll_vfs_mkdir(dir, dchild, mds->mds_vfsmnt, rec->ur_mode);
+                UNLOCK_INODE_MUTEX(dir);
                  mds_counter_incr(req->rq_export, LPROC_MDS_MKDIR);
                  EXIT;
                  break;
@@ -901,10 +1082,13 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                  handle = fsfilt_start(obd, dir, FSFILT_OP_SYMLINK, NULL);
                  if (IS_ERR(handle))
                          GOTO(cleanup, rc = PTR_ERR(handle));
+                LOCK_INODE_MUTEX(dir);
                  if (rec->ur_tgt == NULL)        /* no target supplied */
                          rc = -EINVAL;           /* -EPROTO? */
                  else
-                        rc = ll_vfs_symlink(dir, dchild, rec->ur_tgt, S_IALLUGO);
+                        rc = ll_vfs_symlink(dir, dchild, mds->mds_vfsmnt, 
+                                            rec->ur_tgt, S_IALLUGO);
+                UNLOCK_INODE_MUTEX(dir);
                  mds_counter_incr(req->rq_export, LPROC_MDS_MKNOD);
                  EXIT;
                  break;
@@ -917,7 +1101,10 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                  handle = fsfilt_start(obd, dir, FSFILT_OP_MKNOD, NULL);
                  if (IS_ERR(handle))
                          GOTO(cleanup, rc = PTR_ERR(handle));
-                rc = vfs_mknod(dir, dchild, rec->ur_mode, rdev);
+                LOCK_INODE_MUTEX(dir);
+                rc = ll_vfs_mknod(dir, dchild, mds->mds_vfsmnt, rec->ur_mode, 
+                                  rdev);
+                UNLOCK_INODE_MUTEX(dir);
                  mds_counter_incr(req->rq_export, LPROC_MDS_MKNOD);
                  EXIT;
                  break;
@@ -950,7 +1137,13 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                          ATTR_MTIME | ATTR_CTIME;
  
                  if (rec->ur_fid2->id) {
-                        LASSERT(rec->ur_fid2->id == inode->i_ino);
+                        if (rec->ur_fid2->id != inode->i_ino) {
+                                if (req->rq_export->exp_delayed)
+                                        rc = -EOVERFLOW;
+                                else
+                                        rc = -EFAULT;
+                                GOTO(cleanup, rc);
+                        }
                          inode->i_generation = rec->ur_fid2->generation;
                          /* Dirtied and committed by the upcoming setattr. */
                          CDEBUG(D_INODE, "recreated ino %lu with gen %u\n",
@@ -970,9 +1163,10 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                          CERROR("error on parent setattr: rc = %d\n", rc);
  
                  if (S_ISDIR(inode->i_mode)) {
-                        struct lov_mds_md lmm;
+                        struct lov_mds_md_v3 lmm;
                          int lmm_size = sizeof(lmm);
-                        rc = mds_get_md(obd, dir, &lmm, &lmm_size, 1, 0);
+                        rc = mds_get_md(obd, dir, &lmm, &lmm_size, 1, 0,
+                                        req->rq_export->exp_connect_flags);
                          if (rc > 0) {
                                  LOCK_INODE_MUTEX(inode);
                                  rc = fsfilt_set_md(obd, inode, handle,
@@ -985,16 +1179,19 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                  }
  
                  body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body));
-                mds_pack_inode2fid(&body->fid1, inode);
                  mds_pack_inode2body(body, inode);
          }
          EXIT;
  
  cleanup:
-        err = mds_finish_transno(mds, dir, handle, req, rc, 0, 0);
+        inodes[0] = dir;
+        inodes[1] = dchild ? dchild->d_inode : NULL;
+        err = mds_finish_transno(mds, inodes, handle, req, rc, 0, 0);
+
+cleanup_no_trans:
          if (rec_pending)
                  lquota_pending_commit(mds_quota_interface_ref, obd,
-                                      current->fsuid, gid, 1);
+                                      current->fsuid, gid, rec_pending);
  
          if (rc && created) {
                  /* Destroy the file we just created.  This should not need
@@ -1004,12 +1201,16 @@ cleanup:
                   */
                  switch (type) {
                  case S_IFDIR:
-                        err = vfs_rmdir(dir, dchild);
+                        LOCK_INODE_MUTEX(dir);
+                        err = ll_vfs_rmdir(dir, dchild, mds->mds_vfsmnt);
+                        UNLOCK_INODE_MUTEX(dir);
                          if (err)
                                  CERROR("rmdir in error path: %d\n", err);
                          break;
                  default:
-                        err = vfs_unlink(dir, dchild);
+                        LOCK_INODE_MUTEX(dir);
+                        err = ll_vfs_unlink(dir, dchild, mds->mds_vfsmnt);
+                        UNLOCK_INODE_MUTEX(dir);
                          if (err)
                                  CERROR("unlink in error path: %d\n", err);
                          break;
@@ -1291,6 +1492,10 @@ static int mds_verify_child(struct obd_device *obd,
          int rc = 0, cleanup_phase = 2; /* parent, child locks */
          ENTRY;
  
+        /* not want child - not check it */
+        if (name == NULL)
+                RETURN(0);
+
          vchild = ll_lookup_one_len(name, dparent, namelen - 1);
          if (IS_ERR(vchild))
                  GOTO(cleanup, rc = PTR_ERR(vchild));
@@ -1305,6 +1510,12 @@ static int mds_verify_child(struct obd_device *obd,
  
                  RETURN(0);
          }
+        /* resouce is changed, but not want child lock, return new child */
+        if (child_lockh == NULL) {
+                dput(dchild);
+                *dchildp = vchild;
+                GOTO(cleanup, rc = 0);
+        }
  
          CDEBUG(D_DLMTRACE, "child inode changed: %p != %p (%lu != "LPU64")\n",
                 vchild->d_inode, dchild ? dchild->d_inode : 0,
@@ -1344,6 +1555,7 @@ static int mds_verify_child(struct obd_device *obd,
                          GOTO(cleanup, rc = -EIO);
          } else {
                  memset(child_res_id, 0, sizeof(*child_res_id));
+                memset(child_lockh, 0, sizeof(*child_lockh));
          }
  
          EXIT;
@@ -1378,6 +1590,7 @@ int mds_get_parent_child_locked(struct obd_device *obd, struct mds_obd *mds,
          struct ldlm_res_id parent_res_id = { .name = {0} };
          ldlm_policy_data_t parent_policy = {.l_inodebits = { parent_lockpart }};
          ldlm_policy_data_t child_policy = {.l_inodebits = { child_lockpart }};
+        static struct ldlm_res_id child_res_id_nolock = { .name = {0} };
          struct inode *inode;
          int rc = 0, cleanup_phase = 0;
          ENTRY;
@@ -1398,8 +1611,10 @@ int mds_get_parent_child_locked(struct obd_device *obd, struct mds_obd *mds,
  
          cleanup_phase = 1; /* parent dentry */
  
+        if (name == NULL)
+                GOTO(retry_locks, rc);
          /* Step 2: Lookup child (without DLM lock, to get resource name) */
-        *dchildp = ll_lookup_one_len(name, *dparentp, namelen - 1);
+        *dchildp = mds_lookup(obd, name, *dparentp, namelen - 1);
          if (IS_ERR(*dchildp)) {
                  rc = PTR_ERR(*dchildp);
                  CDEBUG(D_INODE, "child lookup error %d\n", rc);
@@ -1407,6 +1622,7 @@ int mds_get_parent_child_locked(struct obd_device *obd, struct mds_obd *mds,
          }
  
          cleanup_phase = 2; /* child dentry */
+
          inode = (*dchildp)->d_inode;
          if (inode != NULL) {
                  if (is_bad_inode(inode)) {
@@ -1440,16 +1656,13 @@ retry_locks:
           *         exist, we still have to lock the parent and re-lookup. */
          rc = enqueue_ordered_locks(obd,&parent_res_id,parent_lockh,parent_mode,
                                     &parent_policy,
-                                   &child_res_id, child_lockh, child_mode,
+                                   child_lockh ? &child_res_id :
+                                                 &child_res_id_nolock,
+                                   child_lockh, child_mode,
                                     &child_policy);
          if (rc)
                  GOTO(cleanup, rc);
  
-        if (!(*dchildp)->d_inode)
-                cleanup_phase = 3; /* parent lock */
-        else
-                cleanup_phase = 4; /* child lock */
-
          /* Step 4: Re-lookup child to verify it hasn't changed since locking */
          rc = mds_verify_child(obd, &parent_res_id, parent_lockh, *dparentp,
                                parent_mode, &child_res_id, child_lockh, dchildp,
@@ -1457,17 +1670,12 @@ retry_locks:
          if (rc > 0)
                  goto retry_locks;
          if (rc < 0) {
-                cleanup_phase = 2;
                  GOTO(cleanup, rc);
          }
  
  cleanup:
          if (rc) {
                  switch (cleanup_phase) {
-                case 4:
-                        ldlm_lock_decref(child_lockh, child_mode);
-                case 3:
-                        ldlm_lock_decref(parent_lockh, parent_mode);
                  case 2:
                          l_dput(*dchildp);
                  case 1:
@@ -1523,7 +1731,7 @@ static int mds_orphan_add_link(struct mds_update_record *rec,
                 S_ISDIR(inode->i_mode) ? "dir" :
                  S_ISREG(inode->i_mode) ? "file" : "other",rec->ur_name,fidname);
  
-        if (mds_orphan_open_count(inode) == 0 || inode->i_nlink != 0)
+        if (!mds_orphan_needed(obd, inode) || inode->i_nlink != 0)
                  RETURN(0);
  
          pending_child = lookup_one_len(fidname, mds->mds_pending_dir, fidlen);
@@ -1540,7 +1748,8 @@ static int mds_orphan_add_link(struct mds_update_record *rec,
           * for linking and return real mode back then -bzzz */
          mode = inode->i_mode;
          inode->i_mode = S_IFREG;
-        rc = vfs_link(dentry, pending_dir, pending_child);
+        rc = ll_vfs_link(dentry, mds->mds_vfsmnt, pending_dir, pending_child,
+                         mds->mds_vfsmnt);
          if (rc)
                  CERROR("error linking orphan %s to PENDING: rc = %d\n",
                         rec->ur_name, rc);
@@ -1588,10 +1797,10 @@ void mds_shrink_reply(struct obd_device *obd, struct ptlrpc_request *req,
  
          CDEBUG(D_INFO, "Shrink to md_size %d cookie_size %d \n", md_size,
                 cookie_size);
- 
+
          lustre_shrink_reply(req, md_off, md_size, 1);
-        
-        lustre_shrink_reply(req, md_off + (md_size > 0), cookie_size, 0); 
+
+        lustre_shrink_reply(req, md_off + (md_size > 0), cookie_size, 1); 
  }
  
  static int mds_reint_unlink(struct mds_update_record *rec, int offset,
@@ -1602,6 +1811,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
          struct mds_obd *mds = mds_req2mds(req);
          struct obd_device *obd = req->rq_export->exp_obd;
          struct mds_body *body = NULL;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct inode *child_inode = NULL;
          struct lustre_handle parent_lockh, child_lockh, child_reuse_lockh;
          void *handle = NULL;
@@ -1651,6 +1861,16 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
  
          cleanup_phase = 2; /* dchild has a lock */
  
+        /* VBR: version recovery check for parent */
+        rc = mds_version_get_check(req, dparent->d_inode, 0);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+
+        /* version recovery check */
+        rc = mds_version_get_check(req, child_inode, 1);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+
          /* We have to do these checks ourselves, in case we are making an
           * orphan.  The client tells us whether rmdir() or unlink() was called,
           * so we need to return appropriate errors (bug 72). */
@@ -1692,15 +1912,15 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
           * only do the object removal later if no open files/links remain. */
          if ((S_ISDIR(child_inode->i_mode) && child_inode->i_nlink == 2) ||
              child_inode->i_nlink == 1) {
-                if (mds_orphan_open_count(child_inode) > 0) {
+                if (mds_orphan_needed(obd, child_inode)) {
                          /* need to lock pending_dir before transaction */
                          LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
                          cleanup_phase = 5; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */
                  } else if (S_ISREG(child_inode->i_mode)) {
-                        mds_pack_inode2fid(&body->fid1, child_inode);
                          mds_pack_inode2body(body, child_inode);
                          mds_pack_md(obd, req->rq_repmsg, offset + 1, body,
-                                    child_inode, MDS_PACK_MD_LOCK, 0);
+                                    child_inode, MDS_PACK_MD_LOCK, 0,
+                                    req->rq_export->exp_connect_flags);
                  }
          }
  
@@ -1715,7 +1935,9 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                                        NULL);
                  if (IS_ERR(handle))
                          GOTO(cleanup, rc = PTR_ERR(handle));
-                rc = vfs_rmdir(dparent->d_inode, dchild);
+                LOCK_INODE_MUTEX(dparent->d_inode);
+                rc = ll_vfs_rmdir(dparent->d_inode, dchild, mds->mds_vfsmnt);
+                UNLOCK_INODE_MUTEX(dparent->d_inode);
                  mds_counter_incr(req->rq_export, LPROC_MDS_RMDIR);
                  break;
          case S_IFREG: {
@@ -1726,7 +1948,9 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                                            le32_to_cpu(lmm->lmm_stripe_count));
                  if (IS_ERR(handle))
                          GOTO(cleanup, rc = PTR_ERR(handle));
-                rc = vfs_unlink(dparent->d_inode, dchild);
+                LOCK_INODE_MUTEX(dparent->d_inode);
+                rc = ll_vfs_unlink(dparent->d_inode, dchild, mds->mds_vfsmnt);
+                UNLOCK_INODE_MUTEX(dparent->d_inode);
                  mds_counter_incr(req->rq_export, LPROC_MDS_UNLINK);
                  break;
          }
@@ -1739,7 +1963,9 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                                        NULL);
                  if (IS_ERR(handle))
                          GOTO(cleanup, rc = PTR_ERR(handle));
-                rc = vfs_unlink(dparent->d_inode, dchild);
+                LOCK_INODE_MUTEX(dparent->d_inode);
+                rc = ll_vfs_unlink(dparent->d_inode, dchild, mds->mds_vfsmnt);
+                UNLOCK_INODE_MUTEX(dparent->d_inode);
                  mds_counter_incr(req->rq_export, LPROC_MDS_UNLINK);
                  break;
          default:
@@ -1750,7 +1976,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
          }
  
          if (rc == 0 && child_inode->i_nlink == 0) {
-                if (mds_orphan_open_count(child_inode) > 0)
+                if (mds_orphan_needed(obd, child_inode))
                          rc = mds_orphan_add_link(rec, obd, dchild);
  
                  if (rc == 1)
@@ -1778,20 +2004,30 @@ cleanup:
                  struct iattr iattr;
                  int err;
  
+                /* update ctime of unlinked file, even if last link is
+                   removed because open-unlinked file can be statted */
+                iattr.ia_valid = ATTR_CTIME;
+                LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                err = fsfilt_setattr(obd, dchild, handle, &iattr, 0);
+                if (err)
+                        CERROR("error on unlinked inode time update: "
+                               "rc = %d\n", err);
+
+                /* update mtime and ctime of parent directory*/
                  iattr.ia_valid = ATTR_MTIME | ATTR_CTIME;
                  LTIME_S(iattr.ia_mtime) = rec->ur_time;
                  LTIME_S(iattr.ia_ctime) = rec->ur_time;
-
                  err = fsfilt_setattr(obd, dparent, handle, &iattr, 0);
                  if (err)
                          CERROR("error on parent setattr: rc = %d\n", err);
          }
-
-        rc = mds_finish_transno(mds, dparent ? dparent->d_inode : NULL,
-                                handle, req, rc, 0, 0);
+        inodes[0] = dparent ? dparent->d_inode : NULL;
+        inodes[1] = child_inode;
+        rc = mds_finish_transno(mds, inodes, handle, req, rc, 0, 0);
          if (!rc)
                  (void)obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_UNLINKED),
                                           KEY_UNLINKED, 0, NULL, NULL);
+cleanup_no_trans:
          switch(cleanup_phase) {
          case 5: /* pending_dir semaphore */
                  UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
@@ -1824,7 +2060,7 @@ cleanup:
          }
          req->rq_status = rc;
  
-        mds_shrink_reply(obd, req, body, REPLY_REC_OFF + 1);
+        mds_shrink_reply(obd, req, body, offset + 1);
  
          /* trigger dqrel on the owner of child and parent */
          lquota_adjust(mds_quota_interface_ref, obd, qcids, qpids, rc,
@@ -1840,6 +2076,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
          struct dentry *de_src = NULL;
          struct dentry *de_tgt_dir = NULL;
          struct dentry *dchild = NULL;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct mds_obd *mds = mds_req2mds(req);
          struct lustre_handle *handle = NULL, tgt_dir_lockh, src_lockh;
          struct ldlm_res_id src_res_id = { .name = {0} };
@@ -1865,7 +2102,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
  
          if (rec->ur_dlm)
                  ldlm_request_cancel(req, rec->ur_dlm, 0);
-        
+
          /* Step 1: Lookup the source inode and target directory by FID */
          de_src = mds_fid2dentry(mds, rec->ur_fid1, NULL);
          if (IS_ERR(de_src))
@@ -1901,6 +2138,16 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
  
          cleanup_phase = 3; /* locks */
  
+        /* version recovery check */
+        /* directory check */
+        rc = mds_version_get_check(req, de_tgt_dir->d_inode, 0);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+        /* inode version check */
+        rc = mds_version_get_check(req, de_src->d_inode, 1);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+
          if (mds_inode_is_orphan(de_src->d_inode)) {
                  CDEBUG(D_INODE, "an attempt to link an orphan inode %lu/%u\n",
                         de_src->d_inode->i_ino,
@@ -1909,9 +2156,10 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
          }
  
          /* Step 3: Lookup the child */
-        dchild = ll_lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen-1);
+        dchild = mds_lookup(obd, rec->ur_name, de_tgt_dir, rec->ur_namelen-1);
          if (IS_ERR(dchild)) {
                  rc = PTR_ERR(dchild);
+                dchild = NULL;
                  if (rc != -EPERM && rc != -EACCES && rc != -ENAMETOOLONG)
                          CERROR("child lookup error %d\n", rc);
                  GOTO(cleanup, rc);
@@ -1936,14 +2184,40 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
          if (IS_ERR(handle))
                  GOTO(cleanup, rc = PTR_ERR(handle));
  
-        rc = vfs_link(de_src, de_tgt_dir->d_inode, dchild);
+        LOCK_INODE_MUTEX(de_tgt_dir->d_inode);
+        rc = ll_vfs_link(de_src, mds->mds_vfsmnt, de_tgt_dir->d_inode, dchild,
+                         mds->mds_vfsmnt);
+        UNLOCK_INODE_MUTEX(de_tgt_dir->d_inode);
          if (rc && rc != -EPERM && rc != -EACCES)
                  CERROR("vfs_link error %d\n", rc);
+        if (rc == 0) {
+                struct iattr iattr;
+                int err;
+
+                /* update ctime of old file */
+                iattr.ia_valid = ATTR_CTIME;
+                LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                err = fsfilt_setattr(obd, de_src, handle, &iattr, 0);
+                if (err)
+                        CERROR("error on old inode time update: "
+                               "rc = %d\n", err);
+
+                /* update mtime and ctime of target directory */
+                iattr.ia_valid = ATTR_MTIME | ATTR_CTIME;
+                LTIME_S(iattr.ia_mtime) = rec->ur_time;
+                LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                err = fsfilt_setattr(obd, de_tgt_dir, handle, &iattr, 0);
+                if (err)
+                        CERROR("error on target dir inode time update: "
+                               "rc = %d\n", err);
+        }
  cleanup:
-        rc = mds_finish_transno(mds, de_tgt_dir ? de_tgt_dir->d_inode : NULL,
-                                handle, req, rc, 0, 0);
+        inodes[0] = de_tgt_dir ? de_tgt_dir->d_inode : NULL;
+        inodes[1] = (dchild && !IS_ERR(dchild)) ? dchild->d_inode : NULL;
+        rc = mds_finish_transno(mds, inodes, handle, req, rc, 0, 0);
          EXIT;
  
+cleanup_no_trans:
          switch (cleanup_phase) {
          case 4: /* child dentry */
                  l_dput(dchild);
@@ -2013,7 +2287,8 @@ int mds_get_parents_children_locked(struct obd_device *obd,
          /* Only dentry should disappear, but the inode itself would be
             intact otherwise. */
          ldlm_policy_data_t c1_policy = {.l_inodebits = {MDS_INODELOCK_LOOKUP}};
-        /* If something is going to be replaced, both dentry and inode locks are           needed */
+        /* If something is going to be replaced, both dentry and inode locks are
+         * needed */
          ldlm_policy_data_t c2_policy = {.l_inodebits = {MDS_INODELOCK_FULL}};
          struct ldlm_res_id *maxres_src, *maxres_tgt;
          struct inode *inode;
@@ -2048,7 +2323,7 @@ int mds_get_parents_children_locked(struct obd_device *obd,
          p2_res_id.name[1] = (*de_tgtdirp)->d_inode->i_generation;
  
          /* Step 3: Lookup the source child entry */
-        *de_oldp = ll_lookup_one_len(old_name, *de_srcdirp, old_len - 1);
+        *de_oldp = mds_lookup(obd, old_name, *de_srcdirp, old_len - 1);
          if (IS_ERR(*de_oldp)) {
                  rc = PTR_ERR(*de_oldp);
                  CDEBUG(D_INODE, "old child lookup error (%.*s): rc %d\n",
@@ -2072,7 +2347,7 @@ int mds_get_parents_children_locked(struct obd_device *obd,
          /* Step 4: Lookup the target child entry */
          if (!new_name)
                  GOTO(retry_locks, rc);
-        *de_newp = ll_lookup_one_len(new_name, *de_tgtdirp, new_len - 1);
+        *de_newp = mds_lookup(obd, new_name, *de_tgtdirp, new_len - 1);
          if (IS_ERR(*de_newp)) {
                  rc = PTR_ERR(*de_newp);
                  CDEBUG(D_DENTRY, "new child lookup error (%.*s): rc %d\n",
@@ -2083,8 +2358,14 @@ int mds_get_parents_children_locked(struct obd_device *obd,
          cleanup_phase = 4; /* target dentry */
  
          inode = (*de_newp)->d_inode;
-        if (inode != NULL)
+        if (inode != NULL) {
+                if (is_bad_inode(inode)) {
+                        CERROR("bad inode returned %lu/%u\n",
+                               inode->i_ino, inode->i_generation);
+                        GOTO(cleanup, rc = -ENOENT);
+                }
                  inode = igrab(inode);
+        }
          if (inode == NULL)
                  goto retry_locks;
  
@@ -2098,8 +2379,6 @@ retry_locks:
          maxres_tgt = &p2_res_id;
          cleanup_phase = 4; /* target dentry */
  
-        if (c1_res_id.name[0] != 0 && res_gt(&c1_res_id, &p1_res_id,NULL,NULL))
-                maxres_src = &c1_res_id;
          if (c2_res_id.name[0] != 0 && res_gt(&c2_res_id, &p2_res_id,NULL,NULL))
                  maxres_tgt = &c2_res_id;
  
@@ -2136,6 +2415,11 @@ retry_locks:
  
          if (!new_name)
                  GOTO(cleanup, rc);
+
+        /* Safe to skip check for child res being all zero */
+        if (res_gt(&c1_res_id, maxres_src, NULL, NULL))
+                maxres_src = &c1_res_id;
+
          /* Step 6b: Re-lookup target child to verify it hasn't changed */
          rc = mds_verify_child(obd, &p2_res_id, &dlm_handles[1], *de_tgtdirp,
                                parent_mode, &c2_res_id, &dlm_handles[3], de_newp,
@@ -2186,6 +2470,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
          struct dentry *de_old = NULL;
          struct dentry *de_new = NULL;
          struct inode *old_inode = NULL, *new_inode = NULL;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct mds_obd *mds = mds_req2mds(req);
          struct lustre_handle dlm_handles[4];
          struct mds_body *body = NULL;
@@ -2204,7 +2489,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
                    rec->ur_fid2->id, rec->ur_fid2->generation, rec->ur_tgt);
  
          mds_counter_incr(req->rq_export, LPROC_MDS_RENAME);
-        
+
          MDS_CHECK_RESENT(req, mds_reconstruct_generic(req));
  
          if (rec->ur_dlm)
@@ -2224,6 +2509,20 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
          old_inode = de_old->d_inode;
          new_inode = de_new->d_inode;
  
+        /* version recovery check */
+        rc = mds_version_get_check(req, de_srcdir->d_inode, 0);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+        rc = mds_version_get_check(req, old_inode, 1);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+        rc = mds_version_get_check(req, de_tgtdir->d_inode, 2);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+        rc = mds_version_get_check(req, new_inode, 3);
+        if (rc)
+                GOTO(cleanup_no_trans, rc);
+
          if (new_inode != NULL)
                  lock_count = 4;
  
@@ -2268,15 +2567,15 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
  
          if ((S_ISDIR(new_inode->i_mode) && new_inode->i_nlink == 2) ||
              new_inode->i_nlink == 1) {
-                if (mds_orphan_open_count(new_inode) > 0) {
+                if (mds_orphan_needed(obd, new_inode)) {
                          /* need to lock pending_dir before transaction */
                          LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
                          cleanup_phase = 4; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */
                  } else if (S_ISREG(new_inode->i_mode)) {
-                        mds_pack_inode2fid(&body->fid1, new_inode);
                          mds_pack_inode2body(body, new_inode);
                          mds_pack_md(obd, req->rq_repmsg, offset + 1, body,
-                                    new_inode, MDS_PACK_MD_LOCK, 0);
+                                    new_inode, MDS_PACK_MD_LOCK, 0,
+                                    req->rq_export->exp_connect_flags);
                  }
          }
  
@@ -2284,12 +2583,10 @@ no_unlink:
          OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_REINT_RENAME_WRITE,
                         de_srcdir->d_inode->i_sb);
  
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
          /* Check if we are moving old entry into its child. 2.6 does not
             check for this in vfs_rename() anymore */
          if (is_subdir(de_new, de_old))
                  GOTO(cleanup, rc = -EINVAL);
-#endif
  
          lmm = lustre_msg_buf(req->rq_repmsg, offset + 1, 0);
          handle = fsfilt_start_log(obd, de_tgtdir->d_inode, FSFILT_OP_RENAME,
@@ -2302,11 +2599,64 @@ no_unlink:
          de_old->d_fsdata = req;
          de_new->d_fsdata = req;
  
-        rc = vfs_rename(de_srcdir->d_inode, de_old, de_tgtdir->d_inode, de_new);
+        rc = ll_vfs_rename(de_srcdir->d_inode, de_old, mds->mds_vfsmnt, 
+                           de_tgtdir->d_inode, de_new, mds->mds_vfsmnt);
          unlock_kernel();
  
+        if (rc == 0) {
+                struct iattr iattr;
+                int err;
+
+                /* update ctime of renamed file */
+                iattr.ia_valid = ATTR_CTIME;
+                LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                if (S_ISDIR(de_old->d_inode->i_mode) &&
+                    de_srcdir->d_inode != de_tgtdir->d_inode) {
+                        /* cross directory rename of a directory, ".."
+                           changed, update mtime also */
+                        iattr.ia_valid |= ATTR_MTIME;
+                        LTIME_S(iattr.ia_mtime) = rec->ur_time;
+                }
+                err = fsfilt_setattr(obd, de_old, handle, &iattr, 0);
+                if (err)
+                        CERROR("error on old inode time update: "
+                               "rc = %d\n", err);
+
+                if (de_new->d_inode) {
+                        /* target file exists, update its ctime as it
+                           gets unlinked */
+                        iattr.ia_valid = ATTR_CTIME;
+                        LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                        err = fsfilt_setattr(obd, de_new, handle, &iattr, 0);
+                        if (err)
+                                CERROR("error on target inode time update: "
+                                       "rc = %d\n", err);
+                }
+
+                /* update mtime and ctime of old directory */
+                iattr.ia_valid = ATTR_MTIME | ATTR_CTIME;
+                LTIME_S(iattr.ia_mtime) = rec->ur_time;
+                LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                err = fsfilt_setattr(obd, de_srcdir, handle, &iattr, 0); 
+                if (err)
+                        CERROR("error on old dir inode update: "
+                               "rc = %d\n", err);
+
+                if (de_srcdir->d_inode != de_tgtdir->d_inode) {
+                        /* cross directory rename, update
+                           mtime and ctime of new directory */
+                        iattr.ia_valid = ATTR_MTIME | ATTR_CTIME;
+                        LTIME_S(iattr.ia_mtime) = rec->ur_time;
+                        LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                        err = fsfilt_setattr(obd, de_tgtdir, handle, &iattr, 0);
+                        if (err)
+                                CERROR("error on new dir inode time update: "
+                                       "rc = %d\n", err);
+                }
+        }
+
          if (rc == 0 && new_inode != NULL && new_inode->i_nlink == 0) {
-                if (mds_orphan_open_count(new_inode) > 0)
+                if (mds_orphan_needed(obd, new_inode))
                          rc = mds_orphan_add_link(rec, obd, de_new);
  
                  if (rc == 1)
@@ -2334,9 +2684,13 @@ no_unlink:
  
          GOTO(cleanup, rc);
  cleanup:
-        rc = mds_finish_transno(mds, de_tgtdir ? de_tgtdir->d_inode : NULL,
-                                handle, req, rc, 0, 0);
+        inodes[0] = de_srcdir && !IS_ERR(de_srcdir) ? de_srcdir->d_inode : NULL;
+        inodes[1] = old_inode;
+        inodes[2] = de_tgtdir && !IS_ERR(de_tgtdir) ? de_tgtdir->d_inode : NULL;
+        inodes[3] = new_inode;
+        rc = mds_finish_transno(mds, inodes, handle, req, rc, 0, 0);
  
+cleanup_no_trans:
          switch (cleanup_phase) {
          case 4:
                  UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
@@ -2370,6 +2724,8 @@ cleanup:
          }
          req->rq_status = rc;
  
+        mds_shrink_reply(obd, req, body, offset + 1);
+
          /* acquire/release qunit */
          lquota_adjust(mds_quota_interface_ref, obd, qcids, qpids, rc,
                        FSFILT_OP_RENAME);
@@ -2406,7 +2762,7 @@ int mds_reint_rec(struct mds_update_record *rec, int offset,
                   * NB root's creds are believed... */
                  LASSERT (req->rq_uid != 0);
                  rec->ur_uc.luc_fsuid = req->rq_uid;
-                rec->ur_uc.luc_cap = 0;
+                cfs_kernel_cap_unpack(&rec->ur_uc.luc_cap, 0);
          }
  #endif
  
diff --git a/lustre/mds/mds_unlink_open.c b/lustre/mds/mds_unlink_open.c

index 053c549..74daa05 100644 (file)
--- a/lustre/mds/mds_unlink_open.c
+++ b/lustre/mds/mds_unlink_open.c
@@ -1,30 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mds/mds_orphan.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_unlink_open.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  /* code for handling open unlinked files */
@@ -41,7 +54,6 @@
  #include <obd_class.h>
  #include <lustre_fsfilt.h>
  #include <lustre_mds.h>
-#include <lustre_commit_confd.h>
  #include <lvfs.h>
  
  #include "mds_internal.h"
@@ -110,12 +122,12 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild,
          ENTRY;
  
          LASSERT(mds->mds_osc_obd != NULL);
-        
+
          /* We don't need to do any of these other things for orhpan dirs,
           * especially not mds_get_md (may get a default LOV EA, bug 4554) */
          mode = inode->i_mode;
          if (S_ISDIR(mode)) {
-                rc = vfs_rmdir(pending_dir, dchild);
+                rc = ll_vfs_rmdir(pending_dir, dchild, mds->mds_vfsmnt);
                  if (rc)
                          CERROR("error %d unlinking dir %*s from PENDING\n",
                                 rc, dchild->d_name.len, dchild->d_name.name);
@@ -127,7 +139,7 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild,
          if (lmm == NULL)
                  RETURN(-ENOMEM);
  
-        rc = mds_get_md(obd, inode, lmm, &lmm_size, 1, 0);
+        rc = mds_get_md(obd, inode, lmm, &lmm_size, 1, 0, 0);
          if (rc < 0)
                  GOTO(out_free_lmm, rc);
  
@@ -140,7 +152,7 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild,
                  GOTO(out_free_lmm, rc);
          }
  
-        rc = vfs_unlink(pending_dir, dchild);
+        rc = ll_vfs_unlink(pending_dir, dchild, mds->mds_vfsmnt);
          if (rc) {
                  CERROR("error %d unlinking orphan %.*s from PENDING\n",
                         rc, dchild->d_name.len, dchild->d_name.name);
@@ -171,6 +183,19 @@ out_free_lmm:
          RETURN(rc);
  }
  
+static __u64 mds_orphans_max_version(struct obd_device *obd)
+{
+        struct obd_export *exp;
+        __u32 epoch = lr_epoch(obd->u.mds.mds_last_transno);
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) {
+                struct lu_export_data *led = &exp->exp_target_data;
+                epoch = min(epoch, le32_to_cpu(led->led_lcd->lcd_first_epoch));
+        }
+        spin_unlock(&obd->obd_dev_lock);
+        return (__u64)epoch << LR_EPOCH_BITS;
+}
+
  /* Delete inodes which were previously open-unlinked but were not reopened
   * during MDS recovery for whatever reason (e.g. client also failed, recovery
   * aborted, etc). */
@@ -186,6 +211,7 @@ int mds_cleanup_pending(struct obd_device *obd)
          struct list_head dentry_list;
          char d_name[LL_FID_NAMELEN];
          unsigned long inum;
+        __u64 max_version;
          int i = 0, rc = 0, item = 0, namlen;
          ENTRY;
  
@@ -204,13 +230,19 @@ int mds_cleanup_pending(struct obd_device *obd)
          if (IS_ERR(file))
                  GOTO(err_pop, rc = PTR_ERR(file));
  
-        INIT_LIST_HEAD(&dentry_list);
+        CFS_INIT_LIST_HEAD(&dentry_list);
          rc = l_readdir(file, &dentry_list);
          filp_close(file, 0);
          if (rc < 0)
                  GOTO(err_out, rc);
  
+        /** Get maximum version for orphans to delete. All other orphans may be
+         *  needed for delayed clients */
+        max_version = mds_orphans_max_version(obd);
+
          list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) {
+                __u64 version;
+
                  i++;
                  list_del(&dirent->lld_list);
  
@@ -218,7 +250,7 @@ int mds_cleanup_pending(struct obd_device *obd)
                  LASSERT(sizeof(d_name) >= namlen + 1);
                  strcpy(d_name, dirent->lld_name);
                  inum = dirent->lld_ino;
-                OBD_FREE(dirent, sizeof(*dirent));
+                OBD_FREE_PTR(dirent);
  
                  CDEBUG(D_INODE, "entry %d of PENDING DIR: %s\n", i, d_name);
  
@@ -254,6 +286,17 @@ int mds_cleanup_pending(struct obd_device *obd)
                                obd->obd_name, d_name);
                          GOTO(next, rc = 0);
                  }
+                /** Keep orphans for possible use by delayed exports. Remove
+                 * orphans with version lower than minimal one of all exports */
+                version = fsfilt_get_version(obd, child_inode);
+                if ((__s64)version != -EOPNOTSUPP &&
+                    version >= max_version) {
+                        MDS_UP_READ_ORPHAN_SEM(child_inode);
+                        CDEBUG(D_INFO,
+                               "%s: orphan %s is needed for delayed exports\n",
+                               obd->obd_name, d_name);
+                        GOTO(next, rc = 0);
+                }
                  MDS_UP_READ_ORPHAN_SEM(child_inode);
  
                  rc = mds_unlink_orphan(obd, dchild, child_inode, pending_dir);
@@ -284,3 +327,77 @@ err_mntget:
          l_dput(mds->mds_pending_dir);
          goto err_pop;
  }
+
+/**
+ * Determine there is no orphan with the same inode number. That may happens
+ * since unlink replay don't delete inode but keep orphan for delayed clients.
+ * Therefore replays like 'create, unlink, create' will fail due to inode can't
+ * be reused.
+ */
+int mds_check_stale_orphan(struct obd_device *obd, struct ll_fid *fid)
+{
+        struct mds_obd *mds = &obd->u.mds;
+        char fidname[32];
+        struct dentry *result;
+        struct inode *inode, *pending_dir = mds->mds_pending_dir->d_inode;
+        int fidlen = 0, rc = 0;
+
+        /* no need in checks*/
+        if (fid->id == 0 || obd->obd_recovering == 0)
+                RETURN(0);
+
+        /** open by fid like mds_fid2dentry does */
+        snprintf(fidname, sizeof(fidname), "0x%lx", (unsigned long)(fid->id));
+        fidlen = strlen(fidname);
+        result = mds_lookup(obd, fidname, mds->mds_fid_de, fidlen);
+        if (IS_ERR(result))
+                RETURN(0);
+        inode = result->d_inode;
+        if (!inode)
+                GOTO(out, rc = 0);
+
+        LOCK_INODE_MUTEX(pending_dir);
+        MDS_DOWN_READ_ORPHAN_SEM(inode);
+        if (mds_inode_is_orphan(inode)) {
+                struct dentry *orphan;
+
+                /* The exactly same inode can't be orphan */
+                LASSERT(inode->i_generation != fid->generation);
+
+                if (mds_orphan_open_count(inode) > 0) {
+                        CERROR("Orphan "LPU64"/%u is in use!\n",
+                               fid->id, fid->generation);
+                        GOTO(unlock_child, rc = -EFAULT);
+                }
+
+                /** Found orphan in pending dir and delete it */
+                fidlen = ll_fid2str(fidname, fid->id, inode->i_generation);
+                orphan = lookup_one_len(fidname, mds->mds_pending_dir, fidlen);
+                if (IS_ERR(orphan)) {
+                        rc = PTR_ERR(orphan);
+                        CERROR("error looking up %s in PENDING: rc = %d\n",
+                                fidname, rc);
+                        GOTO(unlock_child, rc);
+                }
+                if (orphan->d_inode != inode) {
+                        l_dput(orphan);
+                        CWARN("%s: Found wrong orphan %s %p/%p\n",
+                              obd->obd_name, fidname, orphan->d_inode, inode);
+                        GOTO(unlock_child, rc = -EFAULT);
+                }
+                MDS_UP_READ_ORPHAN_SEM(inode);
+
+                rc = mds_unlink_orphan(obd, orphan, inode, pending_dir);
+                CDEBUG(D_INODE, "%s: removed orphan %s: rc %d\n",
+                       obd->obd_name, fidname, rc);
+                l_dput(orphan);
+                GOTO(unlock, rc);
+        }
+unlock_child:
+        MDS_UP_READ_ORPHAN_SEM(inode);
+unlock:
+        UNLOCK_INODE_MUTEX(pending_dir);
+out:
+        l_dput(result);
+        RETURN(0);
+}
diff --git a/lustre/mds/mds_xattr.c b/lustre/mds/mds_xattr.c

index 6f663ed..accac2d 100644 (file)
--- a/lustre/mds/mds_xattr.c
+++ b/lustre/mds/mds_xattr.c
@@ -1,28 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/mds/mds_xattr.c
- *  Lustre Metadata Server (mds) extended attributes handling
+ * GPL HEADER START
   *
- *  Copyright (C) 2004-2005 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mds/mds_xattr.c
+ *
+ * Lustre Metadata Server (mds) extended attributes handling
   */
  
  #ifndef EXPORT_SYMTAB
@@ -132,10 +145,8 @@ static int mds_getxattr_internal(struct obd_device *obd,
                  DEBUG_REQ(D_INODE, req, "getxattr %s", xattr_name);
  
                  if (inode->i_op && inode->i_op->getxattr) {
-                        lock_24kernel();
                          rc = inode->i_op->getxattr(dentry, xattr_name,
                                                     buf, buflen);
-                        unlock_24kernel();
                  }
  
                  if (rc < 0 && rc != -ENODATA && rc != -EOPNOTSUPP &&
@@ -144,11 +155,8 @@ static int mds_getxattr_internal(struct obd_device *obd,
          } else if (reqbody->valid & OBD_MD_FLXATTRLS) {
                  DEBUG_REQ(D_INODE, req, "listxattr");
  
-                if (inode->i_op && inode->i_op->listxattr) {
-                        lock_24kernel();
+                if (inode->i_op && inode->i_op->listxattr)
                          rc = inode->i_op->listxattr(dentry, buf, buflen);
-                        unlock_24kernel();
-                }
                  if (rc < 0)
                          CDEBUG(D_OTHER, "listxattr failed: %d\n", rc);
          } else
@@ -218,12 +226,13 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
          struct obd_device *obd = req->rq_export->exp_obd;
          struct dentry *de;
          struct inode *inode = NULL;
+        struct inode *inodes[PTLRPC_NUM_VERSIONS] = { NULL };
          struct lustre_handle lockh;
          void *handle = NULL;
          char *xattr_name;
          char *xattr = NULL;
          int xattrlen;
-        int rc = -EOPNOTSUPP, err = 0;
+        int rc = -EOPNOTSUPP, err = 0, sync = 0;
          __u64 lockpart;
          ENTRY;
  
@@ -250,6 +259,9 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
          if (strncmp(xattr_name, "trusted.", 8) == 0) {
                  if (strcmp(xattr_name + 8, XATTR_LUSTRE_MDS_LOV_EA) == 0)
                          GOTO(out, rc = -EACCES);
+                if (strcmp(xattr_name + 8, "lma") == 0 &&
+                    !OBD_FAIL_CHECK(OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING))
+                        GOTO(out, rc = 0);
          }
  
          if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) &&
@@ -270,6 +282,11 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
  
          OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb);
  
+        /* version recovery check */
+        rc = mds_version_get_check(req, inode, 0);
+        if (rc)
+                GOTO(out_dput, rc);
+
          /* filter_op simply use setattr one */
          handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL);
          if (IS_ERR(handle))
@@ -289,18 +306,14 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
                                                         REQ_REC_OFF+2, xattrlen);
  
                          LOCK_INODE_MUTEX(inode);
-                        lock_24kernel();
                          rc = inode->i_op->setxattr(de, xattr_name, xattr,
                                                     xattrlen, body->flags);
-                        unlock_24kernel();
                          UNLOCK_INODE_MUTEX(inode);
                  }
          } else if (body->valid & OBD_MD_FLXATTRRM) {
                  if (inode->i_op && inode->i_op->removexattr) {
                          LOCK_INODE_MUTEX(inode);
-                        lock_24kernel();
                          rc = inode->i_op->removexattr(de, xattr_name);
-                        unlock_24kernel();
                          UNLOCK_INODE_MUTEX(inode);
                  }
          } else {
@@ -310,8 +323,11 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
  
          LASSERT(rc <= 0);
  out_trans:
-        err = mds_finish_transno(mds, inode, handle, req, rc, 0, 0);
-
+        /* security-replated changes may require sync */
+        if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS))
+                sync = mds->mds_sync_permission;
+        inodes[0] = inode;
+        err = mds_finish_transno(mds, inodes, handle, req, rc, 0, sync);
  out_dput:
          l_dput(de);
          if (rc)
diff --git a/lustre/mgc/autoMakefile.am b/lustre/mgc/autoMakefile.am

index 8b88691..db9a433 100644 (file)
--- a/lustre/mgc/autoMakefile.am
+++ b/lustre/mgc/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2006  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if LIBLUSTRE
  noinst_LIBRARIES = libmgc.a
diff --git a/lustre/mgc/libmgc.c b/lustre/mgc/libmgc.c

index c8fd0d5..bb5f415 100644 (file)
--- a/lustre/mgc/libmgc.c
+++ b/lustre/mgc/libmgc.c
@@ -1,28 +1,44 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mgc/mgc_request.c
- *  Lustre Management Client
+ * GPL HEADER START
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/libmgc.c
+ *
+ * Lustre Management Client
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
  /* Minimal MGC for liblustre: only used to read the config log from the MGS
     at setup time, no updates. */
   
@@ -73,12 +89,12 @@ static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
          switch (stage) {
          case OBD_CLEANUP_EARLY: 
          case OBD_CLEANUP_EXPORTS:
-                break;
-        case OBD_CLEANUP_SELF_EXP:
                  rc = obd_llog_finish(obd, 0);
                  if (rc != 0)
                          CERROR("failed to cleanup llogging subsystems\n");
                  break;
+        case OBD_CLEANUP_SELF_EXP:
+                break;
          case OBD_CLEANUP_OBD:
                  break;
          }
@@ -145,4 +161,3 @@ int __init mgc_init(void)
  {
          return class_register_type(&mgc_obd_ops, NULL, LUSTRE_MGC_NAME);
  }
-
diff --git a/lustre/mgc/lproc_mgc.c b/lustre/mgc/lproc_mgc.c

index 1b1fd12..ba98f1b 100644 (file)
--- a/lustre/mgc/lproc_mgc.c
+++ b/lustre/mgc/lproc_mgc.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- * This file is part of the Lustre file system, http://www.lustre.org
- * Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- * You may have signed or agreed to another license before downloading
- * this software.  If so, you are bound by the terms and conditions
- * of that agreement, and the following does not apply to you.  See the
- * LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- * If you did not agree to a different license, then this copy of Lustre
- * is open source software; you can redistribute it and/or  modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- * In either case, Lustre is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
@@ -33,10 +44,11 @@
  
  static struct lprocfs_vars lprocfs_mgc_obd_vars[] = {
          { "uuid",            lprocfs_rd_uuid,          0, 0 },
-        { "ping",            0, lprocfs_wr_ping,          0 },
+        { "ping",            0, lprocfs_wr_ping,       0, 0, 0222 },
          { "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
          { "mgs_server_uuid", lprocfs_rd_server_uuid,   0, 0 },
          { "mgs_conn_uuid",   lprocfs_rd_conn_uuid,     0, 0 },
+        { "import",          lprocfs_rd_import,    0, 0 },
          { 0 }
  };
  
diff --git a/lustre/mgc/mgc_internal.h b/lustre/mgc/mgc_internal.h

index 1edf122..dd16b10 100644 (file)
--- a/lustre/mgc/mgc_internal.h
+++ b/lustre/mgc/mgc_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _MGC_INTERNAL_H
diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c

index 660229d..cc23b99 100644 (file)
--- a/lustre/mgc/mgc_request.c
+++ b/lustre/mgc/mgc_request.c
@@ -1,27 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mgc/mgc_request.c
- *  Lustre Management Client
+ * GPL HEADER START
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/mgc_request.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -73,7 +87,7 @@ static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id)
  int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id)
  {
          /* fsname is at most 8 chars long, maybe contain "-".
-         * e.g. "lustre", "CFS-000" */
+         * e.g. "lustre", "SUN-000" */
          return mgc_name2resid(fsname, strlen(fsname), res_id);
  }
  EXPORT_SYMBOL(mgc_fsname2resid);
@@ -84,7 +98,7 @@ int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id)
          int len;
  
          /* logname consists of "fsname-nodetype".
-         * e.g. "lustre-MDT0001", "CFS-000-client" */
+         * e.g. "lustre-MDT0001", "SUN-000-client" */
          name_end = strrchr(logname, '-');
          LASSERT(name_end);
          len = name_end - logname;
@@ -218,6 +232,8 @@ static int config_log_add(char *logname, struct config_llog_instance *cfg,
          RETURN(rc);
  }
  
+DECLARE_MUTEX(llog_process_lock);
+
  /* Stop watching for updates on this log. */
  static int config_log_end(char *logname, struct config_llog_instance *cfg)
  {       
@@ -231,7 +247,10 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg)
          /* drop the ref from the find */
          config_log_put(cld);
  
+        down(&llog_process_lock);
          cld->cld_stopping = 1;
+        up(&llog_process_lock);
+
          /* drop the start ref */
          config_log_put(cld);
          CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
@@ -322,10 +341,9 @@ static int mgc_requeue_add(struct config_llog_data *cld, int later)
          CDEBUG(D_INFO, "log %s: requeue (l=%d r=%d sp=%d st=%x)\n", 
                 cld->cld_logname, later, atomic_read(&cld->cld_refcount),
                 cld->cld_stopping, rq_state);
-        
+
          /* Hold lock for rq_state */
          spin_lock(&config_list_lock);
-        cld->cld_lostlock = 1;
  
          if (cld->cld_stopping || (rq_state & RQ_STOP)) {
                  spin_unlock(&config_list_lock);
@@ -333,6 +351,8 @@ static int mgc_requeue_add(struct config_llog_data *cld, int later)
                  RETURN(0);
          }
  
+        cld->cld_lostlock = 1;
+
          if (!(rq_state & RQ_RUNNING)) {
                  LASSERT(rq_state == 0);
                  rq_state = RQ_RUNNING | (later ? RQ_LATER : RQ_NOW);
@@ -662,14 +682,10 @@ static int mgc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
          int rc;
          ENTRY;
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        MOD_INC_USE_COUNT;
-#else
          if (!try_module_get(THIS_MODULE)) {
                  CERROR("Can't get module. Is it alive?");
                  return -EINVAL;
          }
-#endif
          switch (cmd) {
          /* REPLicator context */
          case OBD_IOC_PARSE: {
@@ -703,11 +719,7 @@ static int mgc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                  GOTO(out, rc = -ENOTTY);
          }
  out:
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        MOD_DEC_USE_COUNT;
-#else
          module_put(THIS_MODULE);
-#endif
  
          return rc;
  }
@@ -759,7 +771,7 @@ static int mgc_set_mgs_param(struct obd_export *exp,
          struct ptlrpc_request *req;
          struct mgs_send_param *req_msp, *rep_msp;
          int size[] = { sizeof(struct ptlrpc_body), sizeof(*req_msp) };
-        int rep_size[] = { sizeof(struct ptlrpc_body), sizeof(*msp) };
+        __u32 rep_size[] = { sizeof(struct ptlrpc_body), sizeof(*msp) };
          int rc;
          ENTRY;
  
@@ -880,17 +892,16 @@ static int mgc_import_event(struct obd_device *obd,
          CDEBUG(D_MGC, "import event %#x\n", event);
  
          switch (event) {
-        case IMP_EVENT_DISCON: 
-                /* MGC imports should not wait for recovery */
+        case IMP_EVENT_DISCON:
                  break;
-        case IMP_EVENT_INACTIVE: 
+        case IMP_EVENT_INACTIVE:
                  break;
          case IMP_EVENT_INVALIDATE: {
                  struct ldlm_namespace *ns = obd->obd_namespace;
                  ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
                  break;
          }
-        case IMP_EVENT_ACTIVE: 
+        case IMP_EVENT_ACTIVE:
                  LCONSOLE_WARN("%s: Reactivating import\n", obd->obd_name);
                  /* Clearing obd_no_recov allows us to continue pinging */
                  obd->obd_no_recov = 0;
@@ -923,6 +934,10 @@ static int mgc_llog_init(struct obd_device *obd, struct obd_device *tgt,
                  ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
                  llog_initiator_connect(ctxt);
                  llog_ctxt_put(ctxt);
+        } else {
+                ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+                if (ctxt)
+                        llog_cleanup(ctxt);
          }
  
          RETURN(rc);
@@ -1051,7 +1066,8 @@ out_closel:
                  struct client_obd *cli = &obd->u.cli;
                  LASSERT(cli);
                  LASSERT(cli->cl_mgc_configs_dir);
-                rc = lustre_rename(cli->cl_mgc_configs_dir, temp_log, logname);
+                rc = lustre_rename(cli->cl_mgc_configs_dir, cli->cl_mgc_vfsmnt,
+                                   temp_log, logname);
          }
          CDEBUG(D_MGC, "Copied remote log %s (%d)\n", logname, rc);
  out:
@@ -1061,8 +1077,6 @@ out:
          RETURN(rc);
  }
  
-DECLARE_MUTEX(llog_process_lock);
-
  /* Get a config log from the MGS and process it.
     This func is called for both clients and servers. */
  static int mgc_process_log(struct obd_device *mgc, 
@@ -1081,8 +1095,17 @@ static int mgc_process_log(struct obd_device *mgc,
                  CERROR("Missing cld, aborting log update\n");
                  RETURN(-EINVAL);
          }
-        if (cld->cld_stopping) 
+
+        /* I don't want mutliple processes running process_log at once -- 
+           sounds like badness.  It actually might be fine, as long as 
+           we're not trying to update from the same log
+           simultaneously (in which case we should use a per-log sem.) */
+        down(&llog_process_lock);
+
+        if (cld->cld_stopping) {
+                up(&llog_process_lock);
                  RETURN(0);
+        }
  
          OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
  
@@ -1094,15 +1117,10 @@ static int mgc_process_log(struct obd_device *mgc,
          ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
          if (!ctxt) {
                  CERROR("missing llog context\n");
+                up(&llog_process_lock);
                  RETURN(-EINVAL);
          }
  
-        /* I don't want mutliple processes running process_log at once -- 
-           sounds like badness.  It actually might be fine, as long as 
-           we're not trying to update from the same log
-           simultaneously (in which case we should use a per-log sem.) */
-        down(&llog_process_lock);
-
          /* Get the cfg lock on the llog */
          rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL, 
                            LCK_CR, &flags, NULL, NULL, NULL, 
@@ -1276,7 +1294,7 @@ static void /*__exit*/ mgc_exit(void)
          class_unregister_type(LUSTRE_MGC_NAME);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Management Client");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/mgs/autoMakefile.am b/lustre/mgs/autoMakefile.am

index 53734b0..c538cb4 100644 (file)
--- a/lustre/mgs/autoMakefile.am
+++ b/lustre/mgs/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if MODULES
  modulefs_DATA = mgs$(KMODEXT)
diff --git a/lustre/mgs/lproc_mgs.c b/lustre/mgs/lproc_mgs.c

index 5a4f329..f14320e 100644 (file)
--- a/lustre/mgs/lproc_mgs.c
+++ b/lustre/mgs/lproc_mgs.c
@@ -1,33 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <asm/statfs.h>
-#endif
  #include <obd.h>
  #include <obd_class.h>
  #include <lprocfs_status.h>
@@ -164,6 +173,7 @@ struct lprocfs_vars lprocfs_mgs_obd_vars[] = {
          { "fstype",          lprocfs_rd_fstype,      0, 0 },
          { "mntdev",          lprocfs_mgs_rd_mntdev,  0, 0 },
          { "num_exports",     lprocfs_rd_num_exports, 0, 0 },
+        { "hash_stats",      lprocfs_obd_rd_hash,    0, 0 },
          { "evict_client",    0, lprocfs_wr_evict_client, 0 },
          { 0 }
  };
diff --git a/lustre/mgs/mgs_fs.c b/lustre/mgs/mgs_fs.c

index 4e8033d..95a14b4 100644 (file)
--- a/lustre/mgs/mgs_fs.c
+++ b/lustre/mgs/mgs_fs.c
@@ -1,27 +1,45 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mgs/mgs_fs.c
- *  Lustre Management Server (MGS) filesystem interface code
+ * GPL HEADER START
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgs/mgs_fs.c
+ *
+ * Lustre Management Server (MGS) filesystem interface code
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
@@ -31,9 +49,7 @@
  #include <linux/kmod.h>
  #include <linux/version.h>
  #include <linux/sched.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <linux/mount.h>
-#endif
  #include <obd_class.h>
  #include <obd_support.h>
  #include <lustre_disk.h>
@@ -43,13 +59,21 @@
  #include "mgs_internal.h"
  
  
-static int mgs_export_stats_init(struct obd_device *obd, struct obd_export *exp)
+static int mgs_export_stats_init(struct obd_device *obd,
+                                 struct obd_export *exp,
+                                 void *localdata)
  {
+        lnet_nid_t *client_nid = localdata;
          int rc, num_stats, newnid = 0;
  
-        rc = lprocfs_exp_setup(exp, NULL, &newnid);
-        if (rc)
+        rc = lprocfs_exp_setup(exp, client_nid, &newnid);
+        if (rc) {
+                /* Mask error for already created
+                 * /proc entries */
+                if (rc == -EALREADY)
+                        rc = 0;
                  return rc;
+        }
  
          if (newnid) {
                  num_stats = (sizeof(*obd->obd_type->typ_ops) / sizeof(void *)) +
@@ -61,7 +85,19 @@ static int mgs_export_stats_init(struct obd_device *obd, struct obd_export *exp)
                  lprocfs_init_ops_stats(LPROC_MGS_LAST, exp->exp_ops_stats);
                  mgs_stats_counter_init(exp->exp_ops_stats);
                  lprocfs_register_stats(exp->exp_nid_stats->nid_proc, "stats", exp->exp_ops_stats);
+
+                /* Always add in ldlm_stats */
+                exp->exp_nid_stats->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC -
+                                                                         LDLM_FIRST_OPC, 0);
+                if (exp->exp_nid_stats->nid_ldlm_stats == NULL)
+                        return -ENOMEM;
+
+                lprocfs_init_ldlm_stats(exp->exp_nid_stats->nid_ldlm_stats);
+
+                rc = lprocfs_register_stats(exp->exp_nid_stats->nid_proc, "ldlm_stats",
+                                            exp->exp_nid_stats->nid_ldlm_stats);
          }
+
          return 0;
  }
  
@@ -69,15 +105,17 @@ static int mgs_export_stats_init(struct obd_device *obd, struct obd_export *exp)
   * disk in the last_rcvd file or anywhere else.  In the event of a MGS
   * crash all connections are treated as new connections.
   */
-int mgs_client_add(struct obd_device *obd, struct obd_export *exp)
+int mgs_client_add(struct obd_device *obd,
+                   struct obd_export *exp,
+                   void *localdata)
  {
-        return mgs_export_stats_init(obd, exp);
+        return mgs_export_stats_init(obd, exp, localdata);
  }
  
  /* Remove client export data from the MGS */
  int mgs_client_free(struct obd_export *exp)
  {
-        return lprocfs_exp_cleanup(exp);
+        return 0; 
  }
  
  /* Same as mds_fid2dentry */
@@ -97,9 +135,9 @@ static struct dentry *mgs_fid2dentry(struct mgs_obd *mgs, struct ll_fid *fid)
  
          if (ino == 0)
                  RETURN(ERR_PTR(-ESTALE));
-        
+
          snprintf(fid_name, sizeof(fid_name), "0x%lx", ino);
-        
+
          /* under ext3 this is neither supposed to return bad inodes
             nor NULL inodes. */
          result = ll_lookup_one_len(fid_name, mgs->mgs_fid_de, strlen(fid_name));
@@ -177,10 +215,10 @@ int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
  
          /* Setup the configs dir */
-        dentry = simple_mkdir(current->fs->pwd, MOUNT_CONFIGS_DIR, 0777, 1);
+        dentry = simple_mkdir(current->fs->pwd, mnt, MOUNT_CONFIGS_DIR, 0777, 1);
          if (IS_ERR(dentry)) {
                  rc = PTR_ERR(dentry);
-                CERROR("cannot create %s directory: rc = %d\n", 
+                CERROR("cannot create %s directory: rc = %d\n",
                         MOUNT_CONFIGS_DIR, rc);
                  GOTO(err_pop, rc);
          }
diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c

index c67e48f..a567552 100644 (file)
--- a/lustre/mgs/mgs_handler.c
+++ b/lustre/mgs/mgs_handler.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mgs/mgs_handler.c
- *  Lustre Management Server (mgs) request handler
+ * GPL HEADER START
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgs/mgs_handler.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -42,7 +57,6 @@
  #include <lustre_dlm.h>
  #include <lprocfs_status.h>
  #include <lustre_fsfilt.h>
-#include <lustre_commit_confd.h>
  #include <lustre_disk.h>
  #include "mgs_internal.h"
  
@@ -73,7 +87,7 @@ static int mgs_connect(struct lustre_handle *conn, struct obd_device *obd,
                  data->ocd_version = LUSTRE_VERSION_CODE;
          }
  
-        rc = mgs_client_add(obd, exp);
+        rc = mgs_client_add(obd, exp, localdata);
  
          if (rc) {
                  class_disconnect(exp);
@@ -84,6 +98,26 @@ static int mgs_connect(struct lustre_handle *conn, struct obd_device *obd,
          RETURN(rc);
  }
  
+static int mgs_reconnect(struct obd_export *exp, struct obd_device *obd,
+                         struct obd_uuid *cluuid, struct obd_connect_data *data,
+                         void *localdata)
+{
+        ENTRY;
+
+        if (exp == NULL || obd == NULL || cluuid == NULL)
+                RETURN(-EINVAL);
+
+        mgs_counter_incr(exp, LPROC_MGS_CONNECT);
+
+        if (data != NULL) {
+                data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
+                exp->exp_connect_flags = data->ocd_connect_flags;
+                data->ocd_version = LUSTRE_VERSION_CODE;
+        }
+
+        RETURN(0);
+}
+
  static int mgs_disconnect(struct obd_export *exp)
  {
          int rc;
@@ -98,6 +132,8 @@ static int mgs_disconnect(struct obd_export *exp)
          rc = class_disconnect(exp);
          ldlm_cancel_locks_for_export(exp);
  
+        lprocfs_exp_cleanup(exp);
+
          /* complete all outstanding replies */
          spin_lock(&exp->exp_lock);
          while (!list_empty(&exp->exp_outstanding_replies)) {
@@ -127,6 +163,7 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
          struct mgs_obd *mgs = &obd->u.mgs;
          struct lustre_mount_info *lmi;
          struct lustre_sb_info *lsi;
+        struct llog_ctxt *ctxt;
          struct vfsmount *mnt;
          int rc = 0;
          ENTRY;
@@ -135,7 +172,7 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
  
          /* Find our disk */
          lmi = server_get_mount(obd->obd_name);
-        if (!lmi) 
+        if (!lmi)
                  RETURN(rc = -EINVAL);
  
          mnt = lmi->lmi_mnt;
@@ -145,7 +182,7 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
                  GOTO(err_put, rc = PTR_ERR(obd->obd_fsops));
  
          /* namespace for mgs llog */
-        obd->obd_namespace = ldlm_namespace_new(obd, "MGS", LDLM_NAMESPACE_SERVER, 
+        obd->obd_namespace = ldlm_namespace_new(obd, "MGS", LDLM_NAMESPACE_SERVER,
                                                  LDLM_NAMESPACE_MODEST);
          if (obd->obd_namespace == NULL)
                  GOTO(err_ops, rc = -ENOMEM);
@@ -183,11 +220,11 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
                                  mgs_handle, LUSTRE_MGS_NAME,
                                  obd->obd_proc_entry, NULL,
                                  MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX,
-                                "ll_mgs");
+                                "ll_mgs", NULL);
  
          if (!mgs->mgs_service) {
                  CERROR("failed to start service\n");
-                GOTO(err_fs, rc = -ENOMEM);
+                GOTO(err_llog, rc = -ENOMEM);
          }
  
          rc = ptlrpc_start_threads(obd, mgs->mgs_service);
@@ -208,6 +245,10 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
  
  err_thread:
          ptlrpc_unregister_service(mgs->mgs_service);
+err_llog:
+        ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
  err_fs:
          /* No extra cleanup needed for llog_init_commit_thread() */
          mgs_fs_cleanup(obd);
@@ -248,7 +289,7 @@ static int mgs_cleanup(struct obd_device *obd)
  
          if (mgs->mgs_sb == NULL)
                  RETURN(0);
-        
+
          ping_evictor_stop();
  
          ptlrpc_unregister_service(mgs->mgs_service);
@@ -277,15 +318,15 @@ static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname,
          ENTRY;
  
          rc = mgc_fsname2resid(fsname, &res_id);
-        if (!rc) 
+        if (!rc)
                  rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id,
                                              LDLM_PLAIN, NULL, LCK_EX,
                                              &flags, ldlm_blocking_ast,
                                              ldlm_completion_ast, NULL,
                                              fsname, 0, NULL, lockh);
-        if (rc) 
+        if (rc)
                  CERROR("can't take cfg lock for %s (%d)\n", fsname, rc);
-        
+
          RETURN(rc);
  }
  
@@ -296,6 +337,21 @@ static int mgs_put_cfg_lock(struct lustre_handle *lockh)
          RETURN(0);
  }
  
+static void mgs_revoke_lock(struct obd_device *obd, char *fsname,
+                            struct lustre_handle *lockh)
+{
+        int lockrc;
+
+        if (fsname[0]) {
+                lockrc = mgs_get_cfg_lock(obd, fsname, lockh);
+                if (lockrc != ELDLM_OK)
+                        CERROR("lock error %d for fs %s\n", lockrc,
+                               fsname);
+                else
+                        mgs_put_cfg_lock(lockh);
+        }
+}
+
  /* rc=0 means ok
        1 means update
       <0 means error */
@@ -307,10 +363,9 @@ static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
          rc = mgs_check_index(obd, mti);
          if (rc == 0) {
                  LCONSOLE_ERROR_MSG(0x13b, "%s claims to have registered, but "
-                                  "this MGS does not know about it.  Assuming "
-                                  "writeconf.\n", mti->mti_svname);
-                mti->mti_flags |= LDD_F_WRITECONF;
-                rc = 1;
+                                  "this MGS does not know about it, preventing "
+                                  "registration.\n", mti->mti_svname);
+                rc = -ENOENT;
          } else if (rc == -1) {
                  LCONSOLE_ERROR_MSG(0x13c, "Client log %s-client has "
                                     "disappeared! Regenerating all logs.\n",
@@ -320,7 +375,7 @@ static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
          } else {
                  /* Index is correctly marked as used */
  
-                /* If the logs don't contain the mti_nids then add 
+                /* If the logs don't contain the mti_nids then add
                     them as failover nids */
                  rc = mgs_check_failnid(obd, mti);
          }
@@ -330,7 +385,7 @@ static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
  
  /* Called whenever a target starts up.  Flags indicate first connect, etc. */
  static int mgs_handle_target_reg(struct ptlrpc_request *req)
-{    
+{
          struct obd_device *obd = req->rq_export->exp_obd;
          struct lustre_handle lockh;
          struct mgs_target_info *mti, *rep_mti;
@@ -342,7 +397,7 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
  
          mti = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*mti),
                                   lustre_swab_mgs_target_info);
-        
+
          if (!(mti->mti_flags & (LDD_F_WRITECONF | LDD_F_UPGRADE14 |
                                  LDD_F_UPDATE))) {
                  /* We're just here as a startup ping. */
@@ -350,14 +405,14 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
                         mti->mti_svname, obd_export_nid2str(req->rq_export));
                  rc = mgs_check_target(obd, mti);
                  /* above will set appropriate mti flags */
-                if (rc <= 0) 
+                if (rc <= 0)
                          /* Nothing wrong, or fatal error */
                          GOTO(out_nolock, rc);
          }
  
          /* Revoke the config lock to make sure nobody is reading. */
          /* Although actually I think it should be alright if
-           someone was reading while we were updating the logs - if we 
+           someone was reading while we were updating the logs - if we
             revoke at the end they will just update from where they left off. */
          lockrc = mgs_get_cfg_lock(obd, mti->mti_fsname, &lockh);
          if (lockrc != ELDLM_OK) {
@@ -388,7 +443,7 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
                  }
                  mti->mti_flags |= LDD_F_UPDATE;
                  /* Erased logs means start from scratch. */
-                mti->mti_flags &= ~LDD_F_UPGRADE14; 
+                mti->mti_flags &= ~LDD_F_UPGRADE14;
          }
  
          /* COMPAT_146 */
@@ -398,26 +453,26 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
                          CERROR("Can't upgrade from 1.4 (%d)\n", rc);
                          GOTO(out, rc);
                  }
-                
+
                  /* We're good to go */
                  mti->mti_flags |= LDD_F_UPDATE;
          }
          /* end COMPAT_146 */
  
          if (mti->mti_flags & LDD_F_UPDATE) {
-                CDEBUG(D_MGS, "updating %s, index=%d\n", mti->mti_svname, 
+                CDEBUG(D_MGS, "updating %s, index=%d\n", mti->mti_svname,
                         mti->mti_stripe_index);
-                
-                /* create or update the target log 
+
+                /* create or update the target log
                     and update the client/mdt logs */
                  rc = mgs_write_log_target(obd, mti);
                  if (rc) {
-                        CERROR("Failed to write %s log (%d)\n", 
+                        CERROR("Failed to write %s log (%d)\n",
                                 mti->mti_svname, rc);
                          GOTO(out, rc);
                  }
  
-                mti->mti_flags &= ~(LDD_F_VIRGIN | LDD_F_UPDATE | 
+                mti->mti_flags &= ~(LDD_F_VIRGIN | LDD_F_UPDATE |
                                      LDD_F_NEED_INDEX | LDD_F_WRITECONF |
                                      LDD_F_UPGRADE14);
                  mti->mti_flags |= LDD_F_REWRITE_LDD;
@@ -428,9 +483,9 @@ out:
          if (lockrc == ELDLM_OK)
                  mgs_put_cfg_lock(&lockh);
  out_nolock:
-        CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname, 
+        CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname,
                 mti->mti_stripe_index, rc);
-        lustre_pack_reply(req, 2, rep_size, NULL); 
+        lustre_pack_reply(req, 2, rep_size, NULL);
          /* send back the whole mti in the reply */
          rep_mti = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
                                   sizeof(*rep_mti));
@@ -447,7 +502,7 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req)
          struct mgs_send_param *msp, *rep_msp;
          struct lustre_handle lockh;
          int rep_size[] = { sizeof(struct ptlrpc_body), sizeof(*msp) };
-        int lockrc, rc;
+        int rc;
          struct lustre_cfg_bufs bufs;
          struct lustre_cfg *lcfg;
          char fsname[MTI_NAME_MAXLEN];
@@ -466,19 +521,9 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req)
                  RETURN(rc);
          }
  
-        /* Revoke lock so everyone updates.  Should be alright if
-         * someone was already reading while we were updating the logs,
-         * so we don't really need to hold the lock while we're
-         * writing.
-         */
-        if (fsname[0]) {
-                lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
-                if (lockrc != ELDLM_OK)
-                        CERROR("lock error %d for fs %s\n", lockrc,
-                               fsname);
-                else
-                        mgs_put_cfg_lock(&lockh);
-        }
+        /* request for update */
+        mgs_revoke_lock(obd, fsname, &lockh);
+
          lustre_cfg_free(lcfg);
  
          lustre_pack_reply(req, 2, rep_size, NULL);
@@ -605,8 +650,8 @@ int mgs_handle(struct ptlrpc_request *req)
          }
  
          LASSERT(current->journal_info == NULL);
-        
-        if (rc) 
+
+        if (rc)
                  CERROR("MGS handle cmd=%d rc=%d\n", opc, rc);
  
   out:
@@ -614,16 +659,150 @@ int mgs_handle(struct ptlrpc_request *req)
          RETURN(0);
  }
  
+static inline int mgs_init_export(struct obd_export *exp)
+{
+        return ldlm_init_export(exp);
+}
+
  static inline int mgs_destroy_export(struct obd_export *exp)
  {
          ENTRY;
  
          target_destroy_export(exp);
+        ldlm_destroy_export(exp);
          mgs_client_free(exp);
  
          RETURN(0);
  }
  
+static int mgs_extract_fs_pool(char * arg, char *fsname, char *poolname)
+{
+        char *ptr;
+
+        ENTRY;
+        for (ptr = arg;  (*ptr != '\0') && (*ptr != '.'); ptr++ ) {
+                *fsname = *ptr;
+                fsname++;
+        }
+        if (*ptr == '\0')
+                return -EINVAL;
+        *fsname = '\0';
+        ptr++;
+        strcpy(poolname, ptr);
+
+        RETURN(0);
+}
+
+static int mgs_iocontrol_pool(struct obd_device *obd,
+                              struct obd_ioctl_data *data)
+{
+        int rc;
+        struct lustre_handle lockh;
+        struct lustre_cfg *lcfg = NULL;
+        struct llog_rec_hdr rec;
+        char *fsname = NULL;
+        char *poolname = NULL;
+        ENTRY;
+
+        OBD_ALLOC(fsname, MTI_NAME_MAXLEN);
+        if (fsname == NULL)
+                RETURN(-ENOMEM);
+
+        OBD_ALLOC(poolname, LOV_MAXPOOLNAME + 1);
+        if (poolname == NULL) {
+                rc = -ENOMEM;
+                GOTO(out_pool, rc);
+        }
+        rec.lrh_len = llog_data_len(data->ioc_plen1);
+
+        if (data->ioc_type == LUSTRE_CFG_TYPE) {
+                rec.lrh_type = OBD_CFG_REC;
+        } else {
+                CERROR("unknown cfg record type:%d \n", data->ioc_type);
+                rc = -EINVAL;
+                GOTO(out_pool, rc);
+        }
+
+        if (data->ioc_plen1 > CFS_PAGE_SIZE) {
+                rc = -E2BIG;
+                GOTO(out_pool, rc);
+        }
+
+        OBD_ALLOC(lcfg, data->ioc_plen1);
+        if (lcfg == NULL) {
+                rc = -ENOMEM;
+                GOTO(out_pool, rc);
+        }
+        rc = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1);
+        if (rc)
+                GOTO(out_pool, rc);
+
+        if (lcfg->lcfg_bufcount < 2) {
+                rc = -EINVAL;
+                GOTO(out_pool, rc);
+        }
+
+        /* first arg is always <fsname>.<poolname> */
+        mgs_extract_fs_pool(lustre_cfg_string(lcfg, 1), fsname,
+                            poolname);
+
+        switch (lcfg->lcfg_command) {
+        case LCFG_POOL_NEW: {
+                if (lcfg->lcfg_bufcount != 2)
+                        RETURN(-EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_NEW, fsname,
+                                  poolname, NULL);
+                break;
+        }
+        case LCFG_POOL_ADD: {
+                if (lcfg->lcfg_bufcount != 3)
+                        RETURN(-EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_ADD, fsname, poolname,
+                                  lustre_cfg_string(lcfg, 2));
+                break;
+        }
+        case LCFG_POOL_REM: {
+                if (lcfg->lcfg_bufcount != 3)
+                        RETURN(-EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_REM, fsname, poolname,
+                                  lustre_cfg_string(lcfg, 2));
+                break;
+        }
+        case LCFG_POOL_DEL: {
+                if (lcfg->lcfg_bufcount != 2)
+                        RETURN(-EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_DEL, fsname,
+                                  poolname, NULL);
+                break;
+        }
+        default: {
+                 rc = -EINVAL;
+                 GOTO(out_pool, rc);
+        }
+        }
+
+        if (rc) {
+                CERROR("OBD_IOC_POOL err %d, cmd %X for pool %s.%s\n",
+                       rc, lcfg->lcfg_command, fsname, poolname);
+                GOTO(out_pool, rc);
+        }
+
+        /* request for update */
+        mgs_revoke_lock(obd, fsname, &lockh);
+
+out_pool:
+        if (lcfg != NULL)
+                OBD_FREE(lcfg, data->ioc_plen1);
+
+        if (fsname != NULL)
+                OBD_FREE(fsname, MTI_NAME_MAXLEN);
+
+        if (poolname != NULL)
+                OBD_FREE(poolname, LOV_MAXPOOLNAME + 1);
+
+        RETURN(rc);
+}
+
  /* from mdt_iocontrol */
  int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                    void *karg, void *uarg)
@@ -643,7 +822,6 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                  struct lustre_cfg *lcfg;
                  struct llog_rec_hdr rec;
                  char fsname[MTI_NAME_MAXLEN];
-                int lockrc;
  
                  rec.lrh_len = llog_data_len(data->ioc_plen1);
  
@@ -658,7 +836,7 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                  if (lcfg == NULL)
                          RETURN(-ENOMEM);
                  rc = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1);
-                if (rc) 
+                if (rc)
                          GOTO(out_free, rc);
  
                  if (lcfg->lcfg_bufcount < 1)
@@ -674,20 +852,17 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                     someone was already reading while we were updating the logs,
                     so we don't really need to hold the lock while we're
                     writing (above). */
-                if (fsname[0]) {
-                        lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
-                        if (lockrc != ELDLM_OK) 
-                                CERROR("lock error %d for fs %s\n", lockrc, 
-                                       fsname);
-                        else
-                                mgs_put_cfg_lock(&lockh);
-                }
+                mgs_revoke_lock(obd, fsname, &lockh);
  
  out_free:
                  OBD_FREE(lcfg, data->ioc_plen1);
                  RETURN(rc);
          }
  
+        case OBD_IOC_POOL: {
+                RETURN(mgs_iocontrol_pool(obd, data));
+        }
+
          case OBD_IOC_DUMP_LOG: {
                  struct llog_ctxt *ctxt =
                          llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
@@ -726,10 +901,12 @@ out_free:
  static struct obd_ops mgs_obd_ops = {
          .o_owner           = THIS_MODULE,
          .o_connect         = mgs_connect,
+        .o_reconnect       = mgs_reconnect,
          .o_disconnect      = mgs_disconnect,
          .o_setup           = mgs_setup,
          .o_precleanup      = mgs_precleanup,
          .o_cleanup         = mgs_cleanup,
+        .o_init_export     = mgs_init_export,
          .o_destroy_export  = mgs_destroy_export,
          .o_iocontrol       = mgs_iocontrol,
  };
@@ -749,7 +926,7 @@ static void /*__exit*/ mgs_exit(void)
          class_unregister_type(LUSTRE_MGS_NAME);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre  Management Server (MGS)");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h

index 5fbe448..7594bf9 100644 (file)
--- a/lustre/mgs/mgs_internal.h
+++ b/lustre/mgs/mgs_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _MGS_INTERNAL_H
@@ -18,7 +50,7 @@
  
  /* mgs_llog.c */
  int class_dentry_readdir(struct obd_device *obd, struct dentry *dir,
-                         struct vfsmount *inmnt, 
+                         struct vfsmount *inmnt,
                           struct list_head *dentry_list);
  
  #define INDEX_MAP_SIZE 8192     /* covers indicies to FFFF */
@@ -50,8 +82,13 @@ int mgs_erase_log(struct obd_device *obd, char *name);
  int mgs_erase_logs(struct obd_device *obd, char *fsname);
  int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname);
  
+int mgs_pool_cmd(struct obd_device *obd, enum lcfg_command_type cmd,
+                 char *poolname, char *fsname, char *ostname);
+
  /* mgs_fs.c */
-int mgs_client_add(struct obd_device *obd, struct obd_export *exp);
+int mgs_client_add(struct obd_device *obd,
+                   struct obd_export *exp,
+                   void *localdata);
  int mgs_client_free(struct obd_export *exp);
  int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt);
  int mgs_fs_cleanup(struct obd_device *obddev);
@@ -65,7 +102,7 @@ int lproc_mgs_add_live(struct obd_device *obd, struct fs_db *fsdb);
  int lproc_mgs_del_live(struct obd_device *obd, struct fs_db *fsdb);
  void lprocfs_mgs_init_vars(struct lprocfs_static_vars *lvars);
  #else
-static inline int lproc_mgs_setup(struct obd_device *dev) 
+static inline int lproc_mgs_setup(struct obd_device *dev)
  {return 0;}
  static inline int lproc_mgs_cleanup(struct obd_device *obd)
  {return 0;}
@@ -92,4 +129,3 @@ void mgs_counter_incr(struct obd_export *exp, int opcode);
  void mgs_stats_counter_init(struct lprocfs_stats *stats);
  
  #endif /* _MGS_INTERNAL_H */
-
diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c

index 2134bd8..ae033a0 100644 (file)
--- a/lustre/mgs/mgs_llog.c
+++ b/lustre/mgs/mgs_llog.c
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/mgs/mgs_llog.c
- *  Lustre Management Server (mgs) config llog creation
+ * GPL HEADER START
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgs/mgs_llog.c
+ *
+ * Lustre Management Server (mgs) config llog creation
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -1073,8 +1090,10 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb,
          rc = record_attach(obd, llh, mti->mti_svname, LUSTRE_MDS_NAME, 
                             mti->mti_uuid);
          rc = record_setup(obd, llh, mti->mti_svname,
-                          "dev"/*ignored*/, "type"/*ignored*/,
-                          mti->mti_svname, 0/*options*/);
+                          mti->mti_uuid /* Ignored. Compatible with future. */,
+                          "0" /* MDT Index, default to zero. */,
+                          mti->mti_svname,
+                          0 /* options */);
          rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdt"); 
          rc = record_end_log(obd, &llh);
  
@@ -1944,6 +1963,10 @@ int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname)
          ptr = strrchr(devname, '-');
          memset(fsname, 0, MTI_NAME_MAXLEN);
          if (ptr && (server_name2index(ptr, &index, NULL) >= 0)) {
+                /* param related to llite isn't allowed to set by OST or MDT */
+                if (strncmp(param, PARAM_LLITE, sizeof(PARAM_LLITE)) == 0)
+                        RETURN(-EINVAL);
+
                  strncpy(fsname, devname, ptr - devname);
          } else {
                  /* assume devname is the fsname */
@@ -1991,6 +2014,145 @@ out:
  }
  
  
+static int mgs_write_log_pool(struct obd_device *obd, char *logname, struct fs_db *fsdb,
+                       char *lovname,
+                       enum lcfg_command_type cmd,
+                       char *poolname, char *fsname,
+                       char *ostname, char *comment)
+{
+        struct llog_handle *llh = NULL;
+        int rc;
+
+        rc = record_start_log(obd, &llh, logname);
+        if (rc)
+                RETURN(rc);
+        rc = record_marker(obd, llh, fsdb, CM_START, lovname, comment);
+        record_base(obd, llh, lovname, 0, cmd, poolname, fsname, ostname, 0);
+        rc = record_marker(obd, llh, fsdb, CM_END, lovname, comment);
+        rc = record_end_log(obd, &llh);
+
+        return(rc);
+}
+
+int mgs_pool_cmd(struct obd_device *obd, enum lcfg_command_type cmd,
+                 char *fsname, char *poolname, char *ostname)
+{
+        struct fs_db *fsdb;
+        char mdt_index[16];
+        char *lovname;
+        char *logname;
+        char *label, *canceled_label = NULL;
+        int label_sz;
+        struct mgs_target_info *mti;
+        int rc;
+        ENTRY;
+
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb);
+        if (rc) {
+                CERROR("Can't get db for %s\n", fsname);
+                RETURN(rc);
+        }
+        if (fsdb->fsdb_flags & FSDB_LOG_EMPTY) {
+                CERROR("%s is not defined\n", fsname);
+                mgs_free_fsdb(obd, fsdb);
+                RETURN(-EINVAL);
+        }
+
+        label_sz = 10 + strlen(fsname) + strlen(poolname);
+
+        /* check if ostname match fsname */
+        if (ostname != NULL) {
+                char *ptr;
+
+                ptr = strrchr(ostname, '-');
+                if ((ptr == NULL) ||
+                    (strncmp(fsname, ostname, ptr-ostname) != 0))
+                        RETURN(-EINVAL);
+                label_sz += strlen(ostname);
+        }
+
+        OBD_ALLOC(label, label_sz);
+        if (label == NULL)
+                RETURN(-ENOMEM);
+
+        switch(cmd) {
+        case LCFG_POOL_NEW: {
+                sprintf(label,
+                        "new %s.%s", fsname, poolname);
+                break;
+        }
+        case LCFG_POOL_ADD: {
+                sprintf(label,
+                        "add %s.%s.%s", fsname, poolname, ostname);
+                break;
+        }
+        case LCFG_POOL_REM: {
+                OBD_ALLOC(canceled_label, label_sz);
+                if (canceled_label == NULL)
+                         RETURN(-ENOMEM);
+                sprintf(label,
+                        "rem %s.%s.%s", fsname, poolname, ostname);
+                sprintf(canceled_label,
+                        "add %s.%s.%s", fsname, poolname, ostname);
+                break;
+        }
+        case LCFG_POOL_DEL: {
+                OBD_ALLOC(canceled_label, label_sz);
+                if (canceled_label == NULL)
+                         RETURN(-ENOMEM);
+                sprintf(label,
+                        "del %s.%s", fsname, poolname);
+                sprintf(canceled_label,
+                        "new %s.%s", fsname, poolname);
+                break;
+        }
+        default: {
+                break;
+        }
+        }
+
+        down(&fsdb->fsdb_sem);
+
+        sprintf(mdt_index, "-MDT%04x", 0);
+        name_create(&logname, fsname, mdt_index);
+        name_create(&lovname, fsdb->fsdb_mdtlov, "");
+
+        mti = NULL;
+        if (canceled_label != NULL) {
+                OBD_ALLOC(mti, sizeof(*mti));
+                if (mti != NULL) {
+                        strcpy(mti->mti_svname, "lov pool");
+                        mgs_modify(obd, fsdb, mti, logname, lovname,
+                                   canceled_label, CM_SKIP);
+                }
+        }
+
+        mgs_write_log_pool(obd, logname, fsdb, lovname,
+                           cmd, fsname, poolname, ostname, label);
+        name_destroy(&logname);
+
+        name_create(&logname, fsname, "-client");
+        if (canceled_label != NULL) {
+                mgs_modify(obd, fsdb, mti, logname, lovname,
+                           canceled_label, CM_SKIP);
+        }
+        mgs_write_log_pool(obd, logname, fsdb, fsdb->fsdb_clilov,
+                           cmd, fsname, poolname, ostname, label);
+        name_destroy(&logname);
+        name_destroy(&lovname);
+
+        up(&fsdb->fsdb_sem);
+
+        OBD_FREE(label, label_sz);
+        if (canceled_label != NULL)
+                OBD_FREE(canceled_label, label_sz);
+
+        if (mti != NULL)
+                OBD_FREE(mti, sizeof(*mti));
+
+        RETURN(rc);
+}
+
  #if 0
  /******************** unused *********************/
  static int mgs_backup_llog(struct obd_device *obd, char* fsname)
diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in

index d81ee84..05c5881 100644 (file)
--- a/lustre/obdclass/Makefile.in
+++ b/lustre/obdclass/Makefile.in
@@ -20,7 +20,7 @@ sources:
  endif
  
  obdclass-all-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o llog_swab.o
-obdclass-all-objs += class_obd.o class_hash.o
+obdclass-all-objs += class_obd.o class_hash.o target.o
  obdclass-all-objs += debug.o genops.o uuid.o llog_ioctl.o
  obdclass-all-objs += lprocfs_status.o lustre_handles.o lustre_peer.o
  obdclass-all-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o
diff --git a/lustre/obdclass/class_hash.c b/lustre/obdclass/class_hash.c

index 880cd2b..c03bb94 100644 (file)
--- a/lustre/obdclass/class_hash.c
+++ b/lustre/obdclass/class_hash.c
@@ -1,14 +1,51 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2005 Cluster File Systems, Inc.
- *   Author: YuZhangyong <yzy@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org/
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Implement a hash class for hash process in lustre system.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/class_hash.c
+ *
+ * Implement a hash class for hash process in lustre system.
+ *
+ * Author: YuZhangyong <yzy@clusterfs.com>
+ *
+ * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
+ * - Simplified API and improved documentation
+ * - Added per-hash feature flags:
+ *   * LH_DEBUG additional validation
+ *   * LH_REHASH dynamic rehashing
+ * - Added per-hash statistics
+ * - General performance enhancements
   */
  
  #ifndef __KERNEL__
@@ -16,741 +53,705 @@
  #include <obd.h>
  #endif
  
-#include <obd_class.h>
  #include <class_hash.h>
-#include <lustre_export.h>
-#include <obd_support.h>
-#include <lustre_net.h>
-#include <lustre_quota.h>
-
-int lustre_hash_init(struct lustre_class_hash_body **hash_body_new,
-                     char *hashname, __u32 hashsize,
-                     struct lustre_hash_operations *hash_operations)
-{
-        int i, n = 0;
-        struct lustre_class_hash_body *hash_body = NULL;
  
-        LASSERT(hashsize > 0);
-        LASSERT(hash_operations != NULL);
+/**
+ * Initialize new lustre hash, where:
+ * @name     - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops      - Registered hash table operations
+ * @flags    - LH_REHASH enable synamic hash resizing
+ *           - LH_SORT enable chained hash sort
+ */
+lustre_hash_t *
+lustre_hash_init(char *name, unsigned int cur_bits, unsigned int max_bits,
+                 lustre_hash_ops_t *ops, int flags)
+{
+        lustre_hash_t *lh;
+        int            i;
          ENTRY;
-
-        i = hashsize;
-        while (i != 0) {
-                if (i & 0x1)
-                        n++;
-                i >>= 1;
-        }
-
-        LASSERTF(n == 1, "hashsize %u isn't 2^n\n", hashsize);
-
-        /* alloc space for hash_body */
-        OBD_ALLOC(hash_body, sizeof(*hash_body));
-
-        if (hash_body == NULL) {
-                CERROR("Cannot alloc space for hash body, hashname = %s \n",
-                        hashname);
-                RETURN(-ENOMEM);
+  
+        LASSERT(name != NULL);
+        LASSERT(ops != NULL);
+
+        LASSERT(cur_bits > 0);
+        LASSERT(max_bits >= cur_bits);
+        LASSERT(max_bits < 31);
+  
+        OBD_ALLOC_PTR(lh);
+        if (!lh)
+                RETURN(NULL);
+  
+        strncpy(lh->lh_name, name, sizeof(lh->lh_name));
+        atomic_set(&lh->lh_rehash_count, 0);
+        atomic_set(&lh->lh_count, 0);
+        rwlock_init(&lh->lh_rwlock);
+        lh->lh_cur_bits = cur_bits;
+        lh->lh_cur_mask = (1 << cur_bits) - 1;
+        lh->lh_min_bits = cur_bits;
+        lh->lh_max_bits = max_bits;
+        /* XXX: need to fixup lustre_hash_rehash_bits() before this can be
+         *      anything other than 0.5 and 2.0 */
+        lh->lh_min_theta = 1 << (LH_THETA_BITS - 1);
+        lh->lh_max_theta = 1 << (LH_THETA_BITS + 1);
+        lh->lh_ops = ops;
+        lh->lh_flags = flags;
+
+        /* theta * 1000 */
+        __lustre_hash_set_theta(lh, 500, 2000);
+
+        OBD_VMALLOC(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits);
+        if (!lh->lh_buckets) {
+                OBD_FREE_PTR(lh);
+                RETURN(NULL);
          }
-
-        LASSERT(hashname != NULL &&
-                strlen(hashname) <= sizeof(hash_body->hashname));
-        strcpy(hash_body->hashname, hashname);
-        hash_body->lchb_hash_max_size = hashsize;
-        hash_body->lchb_hash_operations = hash_operations;
-
-        /* alloc space for the hash tables */
-        OBD_ALLOC(hash_body->lchb_hash_tables,
-                  sizeof(*hash_body->lchb_hash_tables) * hash_body->lchb_hash_max_size);
-
-        if (hash_body->lchb_hash_tables == NULL) {
-                OBD_FREE(hash_body, sizeof(*hash_body));
-                CERROR("Cannot alloc space for hashtables, hashname = %s \n",
-                        hash_body->hashname);
-                RETURN(-ENOMEM);
+  
+        for (i = 0; i <= lh->lh_cur_mask; i++) {
+                INIT_HLIST_HEAD(&lh->lh_buckets[i].lhb_head);
+                rwlock_init(&lh->lh_buckets[i].lhb_rwlock);
+                atomic_set(&lh->lh_buckets[i].lhb_count, 0);
          }
-
-        spin_lock_init(&hash_body->lchb_lock); /* initialize the body lock */
-
-        for(i = 0 ; i < hash_body->lchb_hash_max_size; i++) {
-                /* initial the bucket lock and list_head */
-                INIT_HLIST_HEAD(&hash_body->lchb_hash_tables[i].lhb_head);
-                spin_lock_init(&hash_body->lchb_hash_tables[i].lhb_lock);
-        }
-        *hash_body_new = hash_body;
-
-        RETURN(0);
+  
+        return lh;
  }
  EXPORT_SYMBOL(lustre_hash_init);
-
-void lustre_hash_exit(struct lustre_class_hash_body **new_hash_body)
+  
+/**
+ * Cleanup lustre hash @lh.
+ */
+void
+lustre_hash_exit(lustre_hash_t *lh)
  {
-        int i;
-        struct lustre_class_hash_body *hash_body = NULL;
+        lustre_hash_bucket_t *lhb;
+        struct hlist_node    *hnode;
+        struct hlist_node    *pos;
+        int                   i;
          ENTRY;
  
-        hash_body = *new_hash_body;
-
-        if (hash_body == NULL) {
-                CWARN("hash body has been deleted\n");
-                goto out_hash;
-        }
-
-        spin_lock(&hash_body->lchb_lock); /* lock the hash tables */
-
-        if (hash_body->lchb_hash_tables == NULL ) {
-                spin_unlock(&hash_body->lchb_lock);
-                CWARN("hash tables has been deleted\n");
-                goto out_hash;
-        }
-
-        for( i = 0; i < hash_body->lchb_hash_max_size; i++ ) {
-                struct lustre_hash_bucket * bucket;
-                struct hlist_node * actual_hnode, *pos;
-
-                bucket = &hash_body->lchb_hash_tables[i];
-                spin_lock(&bucket->lhb_lock); /* lock the bucket */
-                hlist_for_each_safe(actual_hnode, pos, &(bucket->lhb_head)) {
-                        lustre_hash_delitem_nolock(hash_body, i, actual_hnode);
+        LASSERT(lh != NULL);
+  
+        write_lock(&lh->lh_rwlock);
+  
+        lh_for_each_bucket(lh, lhb, i) {
+                write_lock(&lhb->lhb_rwlock);
+                hlist_for_each_safe(hnode, pos, &(lhb->lhb_head)) {
+                        __lustre_hash_bucket_validate(lh, lhb, hnode);
+                        __lustre_hash_bucket_del(lh, lhb, hnode);
+                        lh_exit(lh, hnode);
                  }
-                spin_unlock(&bucket->lhb_lock);
+  
+                LASSERT(hlist_empty(&(lhb->lhb_head)));
+                LASSERT(atomic_read(&lhb->lhb_count) == 0);
+                write_unlock(&lhb->lhb_rwlock);
          }
+  
+        OBD_VFREE(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits);
+        LASSERT(atomic_read(&lh->lh_count) == 0);
+        write_unlock(&lh->lh_rwlock);
+  
+        OBD_FREE_PTR(lh);
+        EXIT;
+}
+EXPORT_SYMBOL(lustre_hash_exit);
  
-        /* free the hash_tables's memory space */
-        OBD_FREE(hash_body->lchb_hash_tables,
-                 sizeof(*hash_body->lchb_hash_tables) *
-                 hash_body->lchb_hash_max_size);
+static inline unsigned int lustre_hash_rehash_bits(lustre_hash_t *lh)
+{
+        if (!(lh->lh_flags & LH_REHASH))
+                return 0;
  
-        hash_body->lchb_hash_tables = NULL;
+        /* XXX: need to handle case with max_theta != 2.0 
+         *      and the case with min_theta != 0.5 */
+        if ((lh->lh_cur_bits < lh->lh_max_bits) &&
+            (__lustre_hash_theta(lh) > lh->lh_max_theta))
+                return lh->lh_cur_bits + 1;
  
-        spin_unlock(&hash_body->lchb_lock);
+        if ((lh->lh_cur_bits > lh->lh_min_bits) &&
+            (__lustre_hash_theta(lh) < lh->lh_min_theta))
+                return lh->lh_cur_bits - 1;
  
-out_hash : 
-        /* free the hash_body's memory space */
-        if (hash_body != NULL) {
-                OBD_FREE(hash_body, sizeof(*hash_body));
-                *new_hash_body = NULL;
-        }
+        return 0;
+}
+  
+/**
+ * Add item @hnode to lustre hash @lh using @key.  The registered
+ * ops->lh_get function will be called when the item is added.
+ */
+void
+lustre_hash_add(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
+{
+        lustre_hash_bucket_t *lhb;
+        int                   bits;
+        unsigned              i;
+        ENTRY;
+  
+        __lustre_hash_key_validate(lh, key, hnode);
+
+        read_lock(&lh->lh_rwlock);
+        i = lh_hash(lh, key, lh->lh_cur_mask);
+        lhb = &lh->lh_buckets[i];
+        LASSERT(i <= lh->lh_cur_mask);
+        LASSERT(hlist_unhashed(hnode));
+
+        write_lock(&lhb->lhb_rwlock);
+        __lustre_hash_bucket_add(lh, lhb, hnode);
+        write_unlock(&lhb->lhb_rwlock);
+
+        bits = lustre_hash_rehash_bits(lh);
+        read_unlock(&lh->lh_rwlock);
+        if (bits)
+                lustre_hash_rehash(lh, bits);
+  
          EXIT;
  }
-EXPORT_SYMBOL(lustre_hash_exit);
+EXPORT_SYMBOL(lustre_hash_add);
  
-/*
- * only allow unique @key in hashtables, if the same @key has existed 
- * in hashtables, it will return with fails.
+static struct hlist_node *
+lustre_hash_findadd_unique_hnode(lustre_hash_t *lh, void *key,
+                                 struct hlist_node *hnode)
+{
+        int                   bits = 0;
+        struct hlist_node    *ehnode;
+        lustre_hash_bucket_t *lhb;
+        unsigned              i;
+        ENTRY;
+  
+        __lustre_hash_key_validate(lh, key, hnode);
+  
+        read_lock(&lh->lh_rwlock);
+        i = lh_hash(lh, key, lh->lh_cur_mask);
+        lhb = &lh->lh_buckets[i];
+        LASSERT(i <= lh->lh_cur_mask);
+        LASSERT(hlist_unhashed(hnode));
+
+        write_lock(&lhb->lhb_rwlock);
+        ehnode = __lustre_hash_bucket_lookup(lh, lhb, key);
+        if (ehnode) {
+                lh_get(lh, ehnode);
+        } else {
+                __lustre_hash_bucket_add(lh, lhb, hnode);
+                ehnode = hnode;
+                bits = lustre_hash_rehash_bits(lh);
+        }
+        write_unlock(&lhb->lhb_rwlock);
+        read_unlock(&lh->lh_rwlock);
+        if (bits)
+                lustre_hash_rehash(lh, bits);
+  
+        RETURN(ehnode);
+}
+  
+/**
+ * Add item @hnode to lustre hash @lh using @key.  The registered
+ * ops->lh_get function will be called if the item was added.
+ * Returns 0 on success or -EALREADY on key collisions.
   */
-int lustre_hash_additem_unique(struct lustre_class_hash_body *hash_body, 
-                               void *key, struct hlist_node *actual_hnode)
+int
+lustre_hash_add_unique(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
  {
-        int hashent;
-        struct lustre_hash_bucket *bucket = NULL;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
+        struct hlist_node    *ehnode;
          ENTRY;
-
-        LASSERT(hlist_unhashed(actual_hnode));
-        hashent = hop->lustre_hashfn(hash_body, key);
-
-        /* get the hash-bucket and lock it */
-        bucket = &hash_body->lchb_hash_tables[hashent];
-        spin_lock(&bucket->lhb_lock);
-
-        if ( (lustre_hash_getitem_in_bucket_nolock(hash_body, hashent, key)) != NULL) {
-                /* the added-item exist in hashtables, so cannot add it again */
-                spin_unlock(&bucket->lhb_lock);
-
-                CWARN("Already found the key in hash [%s]\n", 
-                      hash_body->hashname);
+        
+        ehnode = lustre_hash_findadd_unique_hnode(lh, key, hnode);
+        if (ehnode != hnode) {
+                lh_put(lh, ehnode);
                  RETURN(-EALREADY);
          }
-
-        hlist_add_head(actual_hnode, &(bucket->lhb_head));
-
-#ifdef LUSTRE_HASH_DEBUG
-        /* hash distribute debug */
-        hash_body->lchb_hash_tables[hashent].lhb_item_count++; 
-        CDEBUG(D_INFO, "hashname[%s] bucket[%d] has [%d] hashitem\n", 
-                        hash_body->hashname, hashent, 
-                        hash_body->lchb_hash_tables[hashent].lhb_item_count);
-#endif  
-        hop->lustre_hash_object_refcount_get(actual_hnode); 
-
-        spin_unlock(&bucket->lhb_lock);
-
          RETURN(0);
  }
-EXPORT_SYMBOL(lustre_hash_additem_unique);
-
-/*
- * only allow unique @key in hashtables, if the same @key has existed 
- * in hashtables, it will return with fails.
+EXPORT_SYMBOL(lustre_hash_add_unique);
+  
+/**
+ * Add item @hnode to lustre hash @lh using @key.  If this @key
+ * already exists in the hash then ops->lh_get will be called on the
+ * conflicting entry and that entry will be returned to the caller.
+ * Otherwise ops->lh_get is called on the item which was added.
   */
-void* lustre_hash_findadd_unique(struct lustre_class_hash_body *hash_body, 
-                                     void *key, struct hlist_node *actual_hnode)
+void *
+lustre_hash_findadd_unique(lustre_hash_t *lh, void *key,
+                           struct hlist_node *hnode)
  {
-        int hashent;
-        struct lustre_hash_bucket *bucket = NULL;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
-        struct hlist_node * hash_item_hnode = NULL;
-        void *obj;
+        struct hlist_node    *ehnode;
+        void                 *obj;
          ENTRY;
-
-        LASSERT(hlist_unhashed(actual_hnode));
-        hashent = hop->lustre_hashfn(hash_body, key);
-
-        /* get the hash-bucket and lock it */
-        bucket = &hash_body->lchb_hash_tables[hashent];
-        spin_lock(&bucket->lhb_lock);
-
-        hash_item_hnode = lustre_hash_getitem_in_bucket_nolock(hash_body,
-                                                               hashent, key);
-        if ( hash_item_hnode != NULL) {
-                /* the added-item exist in hashtables, so cannot add it again */
-                obj = hop->lustre_hash_object_refcount_get(hash_item_hnode);
-                spin_unlock(&bucket->lhb_lock);
-                RETURN(obj);
-        }
-
-        hlist_add_head(actual_hnode, &(bucket->lhb_head));
-
-#ifdef LUSTRE_HASH_DEBUG
-        /* hash distribute debug */
-        hash_body->lchb_hash_tables[hashent].lhb_item_count++; 
-        CDEBUG(D_INFO, "hashname[%s] bucket[%d] has [%d] hashitem\n", 
-                        hash_body->hashname, hashent, 
-                        hash_body->lchb_hash_tables[hashent].lhb_item_count);
-#endif
-        obj = hop->lustre_hash_object_refcount_get(actual_hnode);
-
-        spin_unlock(&bucket->lhb_lock);
-
+        
+        ehnode = lustre_hash_findadd_unique_hnode(lh, key, hnode);
+        obj = lh_get(lh, ehnode);
+        lh_put(lh, ehnode);
          RETURN(obj);
  }
  EXPORT_SYMBOL(lustre_hash_findadd_unique);
-
-/*
- * this version of additem, it allow multi same @key <key, value> in hashtables. 
- * in this additem version, we don't need to check if exist same @key in hash 
- * tables, we only add it to related hashbucket.
- * example: maybe same nid will be related to multi difference export
+  
+/**
+ * Delete item @hnode from the lustre hash @lh using @key.  The @key
+ * is required to ensure the correct hash bucket is locked since there
+ * is no direct linkage from the item to the bucket.  The object
+ * removed from the hash will be returned and obs->lh_put is called
+ * on the removed object.
   */
-int lustre_hash_additem(struct lustre_class_hash_body *hash_body, void *key, 
-                         struct hlist_node *actual_hnode)
+void *
+lustre_hash_del(lustre_hash_t *lh, void *key, struct hlist_node *hnode)
  {
-        int hashent;
-        struct lustre_hash_bucket *bucket = NULL;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
+        lustre_hash_bucket_t *lhb;
+        unsigned              i;
+        void                 *obj;
          ENTRY;
-
-        LASSERT(hlist_unhashed(actual_hnode));
-
-        hashent = hop->lustre_hashfn(hash_body, key);
-
-        /* get the hashbucket and lock it */
-        bucket = &hash_body->lchb_hash_tables[hashent];
-        spin_lock(&bucket->lhb_lock);
-
-        hlist_add_head(actual_hnode, &(bucket->lhb_head));
-
-#ifdef LUSTRE_HASH_DEBUG
-        /* hash distribute debug */
-        hash_body->lchb_hash_tables[hashent].lhb_item_count++; 
-        CDEBUG(D_INFO, "hashname[%s] bucket[%d] has [%d] hashitem\n", 
-                        hash_body->hashname, hashent, 
-                        hash_body->lchb_hash_tables[hashent].lhb_item_count);
-#endif  
-        hop->lustre_hash_object_refcount_get(actual_hnode); 
-
-        spin_unlock(&bucket->lhb_lock);
-
-        RETURN(0);
+  
+        __lustre_hash_key_validate(lh, key, hnode);
+  
+        read_lock(&lh->lh_rwlock);
+        i = lh_hash(lh, key, lh->lh_cur_mask);
+        lhb = &lh->lh_buckets[i];
+        LASSERT(i <= lh->lh_cur_mask);
+        LASSERT(!hlist_unhashed(hnode));
+
+        write_lock(&lhb->lhb_rwlock);
+        obj = __lustre_hash_bucket_del(lh, lhb, hnode);
+        write_unlock(&lhb->lhb_rwlock);
+        read_unlock(&lh->lh_rwlock);
+  
+        RETURN(obj);
  }
-EXPORT_SYMBOL(lustre_hash_additem);
-
-
-/*
- * this version of delitem will delete a hashitem with given @key, 
- * we need to search the <@key, @value> in hashbucket with @key, 
- * if match, the hashitem will be delete. 
- * we have a no-search version of delitem, it will directly delete a hashitem, 
- * doesn't need to search it in hashtables, so it is a O(1) delete.
+EXPORT_SYMBOL(lustre_hash_del);
+  
+/**
+ * Delete item given @key in lustre hash @lh.  The first @key found in
+ * the hash will be removed, if the key exists multiple times in the hash
+ * @lh this function must be called once per key.  The removed object
+ * will be returned and ops->lh_put is called on the removed object.
   */
-int lustre_hash_delitem_by_key(struct lustre_class_hash_body *hash_body, 
-                               void *key)
+void *
+lustre_hash_del_key(lustre_hash_t *lh, void *key)
  {
-        int hashent ;
-        struct hlist_node * hash_item;
-        struct lustre_hash_bucket *bucket = NULL;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
-        int retval = 0;
+        struct hlist_node    *hnode;
+        lustre_hash_bucket_t *lhb;
+        unsigned              i;
+        void                 *obj = NULL;
          ENTRY;
-
-        hashent = hop->lustre_hashfn(hash_body, key);
-
-        /* first, lock the hashbucket */
-        bucket = &hash_body->lchb_hash_tables[hashent];
-        spin_lock(&bucket->lhb_lock);
-
-        /* get the hash_item from hash_bucket */
-        hash_item = lustre_hash_getitem_in_bucket_nolock(hash_body, hashent, 
-                                                         key);
-
-        if (hash_item == NULL) {
-                spin_unlock(&bucket->lhb_lock);
-                RETURN(-ENOENT);
-        }
-
-        /* call delitem_nolock() to delete the hash_item */
-        retval = lustre_hash_delitem_nolock(hash_body, hashent, hash_item);
-
-        spin_unlock(&bucket->lhb_lock);
-
-        RETURN(retval);
+  
+        read_lock(&lh->lh_rwlock);
+        i = lh_hash(lh, key, lh->lh_cur_mask);
+        lhb = &lh->lh_buckets[i];
+        LASSERT(i <= lh->lh_cur_mask);
+
+        write_lock(&lhb->lhb_rwlock);
+        hnode = __lustre_hash_bucket_lookup(lh, lhb, key);
+        if (hnode)
+                obj = __lustre_hash_bucket_del(lh, lhb, hnode);
+
+        write_unlock(&lhb->lhb_rwlock);
+        read_unlock(&lh->lh_rwlock);
+  
+        RETURN(obj);
  }
-EXPORT_SYMBOL(lustre_hash_delitem_by_key);
-
-/*
- * the O(1) version of delete hash item, 
- * it will directly delete the hashitem with given @hash_item,
- * the parameter @key used to get the relation hash bucket and lock it.
+EXPORT_SYMBOL(lustre_hash_del_key);
+  
+/**
+ * Lookup an item using @key in the lustre hash @lh and return it.
+ * If the @key is found in the hash lh->lh_get() is called and the
+ * matching objects is returned.  It is the callers responsibility
+ * to call the counterpart ops->lh_put using the lh_put() macro
+ * when when finished with the object.  If the @key was not found
+ * in the hash @lh NULL is returned.
   */
-int lustre_hash_delitem(struct lustre_class_hash_body *hash_body, 
-                        void *key, struct hlist_node * hash_item)
-{  
-        int hashent = 0;
-        int retval = 0;
-        struct lustre_hash_bucket *bucket = NULL;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
+void *
+lustre_hash_lookup(lustre_hash_t *lh, void *key)
+{
+        struct hlist_node    *hnode;
+        lustre_hash_bucket_t *lhb;
+        unsigned              i;
+        void                 *obj = NULL;
          ENTRY;
-
-        hashent = hop->lustre_hashfn(hash_body, key);
-
-        bucket = &hash_body->lchb_hash_tables[hashent];
-        spin_lock(&bucket->lhb_lock);
-
-        /* call delitem_nolock() to delete the hash_item */
-        retval = lustre_hash_delitem_nolock(hash_body, hashent, hash_item);
-
-        spin_unlock(&bucket->lhb_lock);
-
-        RETURN(retval);
+  
+        read_lock(&lh->lh_rwlock);
+        i = lh_hash(lh, key, lh->lh_cur_mask);
+        lhb = &lh->lh_buckets[i];
+        LASSERT(i <= lh->lh_cur_mask);
+
+        read_lock(&lhb->lhb_rwlock);
+        hnode = __lustre_hash_bucket_lookup(lh, lhb, key);
+        if (hnode)
+                obj = lh_get(lh, hnode);
+  
+        read_unlock(&lhb->lhb_rwlock);
+        read_unlock(&lh->lh_rwlock);
+  
+        RETURN(obj);
  }
-EXPORT_SYMBOL(lustre_hash_delitem);
-
-void lustre_hash_bucket_iterate(struct lustre_class_hash_body *hash_body,
-                                void *key, hash_item_iterate_cb func, void *data)
+EXPORT_SYMBOL(lustre_hash_lookup);
+  
+/**
+ * For each item in the lustre hash @lh call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ * Before each callback ops->lh_get will be called, and after each
+ * callback ops->lh_put will be called.  Finally, during the callback
+ * the bucket lock is held so the callback must never sleep.
+ */
+void
+lustre_hash_for_each(lustre_hash_t *lh, lh_for_each_cb func, void *data)
  {
-        int hashent, find = 0;
-        struct lustre_hash_bucket *bucket = NULL;
-        struct hlist_node *hash_item_node = NULL;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
-        struct obd_export *tmp = NULL;
-
+        struct hlist_node    *hnode;
+        lustre_hash_bucket_t *lhb;
+        void                 *obj;
+        int                   i;
          ENTRY;
-
-        hashent = hop->lustre_hashfn(hash_body, key);
-        bucket = &hash_body->lchb_hash_tables[hashent];
-
-        spin_lock(&bucket->lhb_lock);
-        hlist_for_each(hash_item_node, &(bucket->lhb_head)) {
-                find = hop->lustre_hash_key_compare(key, hash_item_node);
-                if (find) {
-                        tmp = hop->lustre_hash_object_refcount_get(hash_item_node);
-                        func(tmp, data);
-                        hop->lustre_hash_object_refcount_put(hash_item_node);
+  
+        read_lock(&lh->lh_rwlock);
+        lh_for_each_bucket(lh, lhb, i) {
+                read_lock(&lhb->lhb_rwlock);
+                hlist_for_each(hnode, &(lhb->lhb_head)) {
+                        __lustre_hash_bucket_validate(lh, lhb, hnode);
+                        obj = lh_get(lh, hnode);
+                        func(obj, data);
+                        (void)lh_put(lh, hnode);
                  }
+                read_unlock(&lhb->lhb_rwlock);
          }
-        spin_unlock(&bucket->lhb_lock);
-}
-EXPORT_SYMBOL(lustre_hash_bucket_iterate);
+        read_unlock(&lh->lh_rwlock);
  
-void lustre_hash_iterate_all(struct lustre_class_hash_body *hash_body,
-                            hash_item_iterate_cb func, void *data)
+        EXIT;
+}
+EXPORT_SYMBOL(lustre_hash_for_each);
+  
+/**
+ * For each item in the lustre hash @lh call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ * Before each callback ops->lh_get will be called, and after each
+ * callback ops->lh_put will be called.  During the callback the
+ * bucket lock will not be held will allows for the current item
+ * to be removed from the hash during the callback.  However, care
+ * should be taken to prevent other callers from operating on the
+ * hash concurrently or list corruption may occur.
+ */
+void
+lustre_hash_for_each_safe(lustre_hash_t *lh, lh_for_each_cb func, void *data)
  {
-        int i;
-        struct lustre_hash_operations *hop = hash_body->lchb_hash_operations;
+        struct hlist_node    *hnode;
+        struct hlist_node    *pos;
+        lustre_hash_bucket_t *lhb;
+        void                 *obj;
+        int                   i;
          ENTRY;
-
-        for( i = 0; i < hash_body->lchb_hash_max_size; i++ ) {
-                struct lustre_hash_bucket * bucket;
-                struct hlist_node * actual_hnode, *pos;
-                void *obj;
-
-                bucket = &hash_body->lchb_hash_tables[i];
-#ifdef LUSTRE_HASH_DEBUG
-                CDEBUG(D_INFO, "idx %d - bucket %p\n", i, bucket);
-#endif
-                spin_lock(&bucket->lhb_lock); /* lock the bucket */
-                hlist_for_each_safe(actual_hnode, pos, &(bucket->lhb_head)) {
-                        obj = hop->lustre_hash_object_refcount_get(actual_hnode);
+  
+        read_lock(&lh->lh_rwlock);
+        lh_for_each_bucket(lh, lhb, i) {
+                read_lock(&lhb->lhb_rwlock);
+                hlist_for_each_safe(hnode, pos, &(lhb->lhb_head)) {
+                        __lustre_hash_bucket_validate(lh, lhb, hnode);
+                        obj = lh_get(lh, hnode);
+                        read_unlock(&lhb->lhb_rwlock);
                          func(obj, data);
-                        hop->lustre_hash_object_refcount_put(actual_hnode);
+                        read_lock(&lhb->lhb_rwlock);
+                        (void)lh_put(lh, hnode);
                  }
-                spin_unlock(&bucket->lhb_lock);
+                read_unlock(&lhb->lhb_rwlock);
          }
+        read_unlock(&lh->lh_rwlock);
          EXIT;
  }
-EXPORT_SYMBOL(lustre_hash_iterate_all);
-
-
-void * lustre_hash_get_object_by_key(struct lustre_class_hash_body *hash_body,
-                                     void *key)
+EXPORT_SYMBOL(lustre_hash_for_each_safe);
+  
+/**
+ * For each hash bucket in the lustre hash @lh call the passed callback
+ * @func until all the hash buckets are empty.  The passed callback @func
+ * or the previously registered callback lh->lh_put must remove the item
+ * from the hash.  You may either use the lustre_hash_del() or hlist_del()
+ * functions.  No rwlocks will be held during the callback @func it is
+ * safe to sleep if needed.  This function will not terminate until the
+ * hash is empty.  Note it is still possible to concurrently add new
+ * items in to the hash.  It is the callers responsibility to ensure
+ * the required locking is in place to prevent concurrent insertions.
+ */
+void
+lustre_hash_for_each_empty(lustre_hash_t *lh, lh_for_each_cb func, void *data)
  {
-        int hashent ;
-        struct hlist_node * hash_item_hnode = NULL;
-        void * obj_value = NULL;
-        struct lustre_hash_bucket *bucket = NULL;
-        struct lustre_hash_operations * hop = hash_body->lchb_hash_operations;
+        struct hlist_node    *hnode;
+        lustre_hash_bucket_t *lhb;
+        void                 *obj;
+        int                   i;
          ENTRY;
-
-        /* get the hash value from the given item */
-        hashent = hop->lustre_hashfn(hash_body, key);
-
-        bucket = &hash_body->lchb_hash_tables[hashent];
-        spin_lock(&bucket->lhb_lock); /* lock the bucket */
-
-        hash_item_hnode = lustre_hash_getitem_in_bucket_nolock(hash_body, 
-                                                               hashent, key);
-
-        if (hash_item_hnode == NULL) {
-                spin_unlock(&bucket->lhb_lock); /* lock the bucket */
-                RETURN(NULL);
+  
+restart:
+        read_lock(&lh->lh_rwlock);
+        lh_for_each_bucket(lh, lhb, i) {
+                write_lock(&lhb->lhb_rwlock);
+                while (!hlist_empty(&lhb->lhb_head)) {
+                        hnode =  lhb->lhb_head.first;
+                        __lustre_hash_bucket_validate(lh, lhb, hnode);
+                        obj = lh_get(lh, hnode);
+                        write_unlock(&lhb->lhb_rwlock);
+                        read_unlock(&lh->lh_rwlock);
+                        func(obj, data);
+                        (void)lh_put(lh, hnode);
+                        goto restart;
+                }
+                write_unlock(&lhb->lhb_rwlock);
          }
-
-        obj_value = hop->lustre_hash_object_refcount_get(hash_item_hnode);
-        spin_unlock(&bucket->lhb_lock); /* lock the bucket */
-
-        RETURN(obj_value);
-}
-EXPORT_SYMBOL(lustre_hash_get_object_by_key);
-
-/* string hashing using djb2 hash algorithm */
-__u32 djb2_hashfn(struct lustre_class_hash_body *hash_body,  void* key, 
-                  size_t size)
-{
-        __u32 hash = 5381;
-        int i;
-        char *ptr = key;
-
-        LASSERT(key != NULL);
-
-        for (i=0; i<size; i++) 
-                hash = hash * 33 + ptr[i];
-
-        hash &= (hash_body->lchb_hash_max_size - 1);
-
-        RETURN(hash);
+        read_unlock(&lh->lh_rwlock);
+        EXIT;
  }
+EXPORT_SYMBOL(lustre_hash_for_each_empty);
  
  /*
- * define (uuid <-> export) hash operations and function define
- */
-
-/* define the uuid hash operations */
-struct lustre_hash_operations uuid_hash_operations = {
-        .lustre_hashfn = uuid_hashfn,
-        .lustre_hash_key_compare = uuid_hash_key_compare,
-        .lustre_hash_object_refcount_get = uuid_export_refcount_get,
-        .lustre_hash_object_refcount_put = uuid_export_refcount_put,
-};
-
-__u32 uuid_hashfn(struct lustre_class_hash_body *hash_body,  void * key)
-{
-        struct obd_uuid * uuid_key = key;
-
-        return djb2_hashfn(hash_body, uuid_key->uuid, sizeof(uuid_key->uuid));
-}
-
-/* Note, it is impossible to find an export that is in failed state with
- * this function */
-int uuid_hash_key_compare(void *key, struct hlist_node *compared_hnode)
-{
-        struct obd_export *export = NULL;
-        struct obd_uuid *uuid_key = NULL, *compared_uuid = NULL;
-
-        LASSERT( key != NULL);
-
-        uuid_key = (struct obd_uuid*)key;
-
-        export = hlist_entry(compared_hnode, struct obd_export, exp_uuid_hash);
-
-        compared_uuid = &export->exp_client_uuid;
-
-        RETURN(obd_uuid_equals(uuid_key, compared_uuid) &&
-               !export->exp_failed);
-}
-
-void * uuid_export_refcount_get(struct hlist_node * actual_hnode)
+ * For each item in the lustre hash @lh which matches the @key call
+ * the passed callback @func and pass to it as an argument each hash
+ * item and the private @data.  Before each callback ops->lh_get will
+ * be called, and after each callback ops->lh_put will be called.
+ * Finally, during the callback the bucket lock is held so the
+ * callback must never sleep.
+   */
+void
+lustre_hash_for_each_key(lustre_hash_t *lh, void *key,
+                         lh_for_each_cb func, void *data)
  {
-        struct obd_export *export = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        export = hlist_entry(actual_hnode, struct obd_export, exp_uuid_hash);
-
-        LASSERT(export != NULL);
-
-        class_export_get(export);
-
-        RETURN(export);
-}
-
-void uuid_export_refcount_put(struct hlist_node * actual_hnode)
-{
-        struct obd_export *export = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        export = hlist_entry(actual_hnode, struct obd_export, exp_uuid_hash);
-
-        LASSERT(export != NULL);
-
-        class_export_put(export);
+        struct hlist_node    *hnode;
+        lustre_hash_bucket_t *lhb;
+        unsigned              i;
+        ENTRY;
+  
+        read_lock(&lh->lh_rwlock);
+        i = lh_hash(lh, key, lh->lh_cur_mask);
+        lhb = &lh->lh_buckets[i];
+        LASSERT(i <= lh->lh_cur_mask);
+  
+        read_lock(&lhb->lhb_rwlock);
+        hlist_for_each(hnode, &(lhb->lhb_head)) {
+                __lustre_hash_bucket_validate(lh, lhb, hnode);
+  
+                if (!lh_compare(lh, key, hnode))
+                        continue;
+  
+                func(lh_get(lh, hnode), data);
+                (void)lh_put(lh, hnode);
+        }
+  
+        read_unlock(&lhb->lhb_rwlock);
+        read_unlock(&lh->lh_rwlock);
+  
+        EXIT;
  }
-
-/*
- * define (nid <-> export) hash operations and function define
+EXPORT_SYMBOL(lustre_hash_for_each_key);
+  
+/**
+ * Rehash the lustre hash @lh to the given @bits.  This can be used
+ * to grow the hash size when excessive chaining is detected, or to
+ * shrink the hash when it is larger than needed.  When the LH_REHASH
+ * flag is set in @lh the lustre hash may be dynamically rehashed
+ * during addition or removal if the hash's theta value exceeds
+ * either the lh->lh_min_theta or lh->max_theta values.  By default
+ * these values are tuned to keep the chained hash depth small, and
+ * this approach assumes a reasonably uniform hashing function.  The
+ * theta thresholds for @lh are tunable via lustre_hash_set_theta().
   */
-
-/* define the nid hash operations */
-struct lustre_hash_operations nid_hash_operations = {
-        .lustre_hashfn = nid_hashfn,
-        .lustre_hash_key_compare = nid_hash_key_compare,
-        .lustre_hash_object_refcount_get = nid_export_refcount_get,
-        .lustre_hash_object_refcount_put = nid_export_refcount_put,
-};
-
-__u32 nid_hashfn(struct lustre_class_hash_body *hash_body,  void * key)
+int
+lustre_hash_rehash(lustre_hash_t *lh, int bits)
  {
-        return djb2_hashfn(hash_body, key, sizeof(lnet_nid_t));
-}
-
-/* Note, it is impossible to find an export that is in failed state with
- * this function */
-int nid_hash_key_compare(void *key, struct hlist_node *compared_hnode)
-{
-        struct obd_export *export = NULL;
-        lnet_nid_t *nid_key = NULL;
-
-        LASSERT( key != NULL);
-
-        nid_key = (lnet_nid_t*)key;
-
-        export = hlist_entry(compared_hnode, struct obd_export, exp_nid_hash);
-
-        return (export->exp_connection->c_peer.nid == *nid_key &&
-                !export->exp_failed);
-}
-
-void *nid_export_refcount_get(struct hlist_node *actual_hnode)
-{
-        struct obd_export *export = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        export = hlist_entry(actual_hnode, struct obd_export, exp_nid_hash);
-
-        LASSERT(export != NULL);
-
-        class_export_get(export);
-
-        RETURN(export);
-}
-
-void nid_export_refcount_put(struct hlist_node *actual_hnode)
-{
-        struct obd_export *export = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        export = hlist_entry(actual_hnode, struct obd_export, exp_nid_hash);
-
-        LASSERT(export != NULL);
+        struct hlist_node     *hnode;
+        struct hlist_node     *pos;
+        lustre_hash_bucket_t  *lh_buckets;
+        lustre_hash_bucket_t  *rehash_buckets;
+        lustre_hash_bucket_t  *lh_lhb;
+        lustre_hash_bucket_t  *rehash_lhb;
+        int                    i;
+        int                    theta;
+        int                    lh_mask;
+        int                    lh_bits;
+        int                    mask = (1 << bits) - 1;
+        void                  *key;
+        ENTRY;
+  
+        LASSERT(!in_interrupt());
+        LASSERT(mask > 0);
  
-        class_export_put(export);
+        OBD_VMALLOC(rehash_buckets, sizeof(*rehash_buckets) << bits);
+        if (!rehash_buckets)
+                RETURN(-ENOMEM);
+  
+        for (i = 0; i <= mask; i++) {
+                INIT_HLIST_HEAD(&rehash_buckets[i].lhb_head);
+                rwlock_init(&rehash_buckets[i].lhb_rwlock);
+                atomic_set(&rehash_buckets[i].lhb_count, 0);
+        }
+  
+        write_lock(&lh->lh_rwlock);
+
+        /* 
+         * Early return for multiple concurrent racing callers,
+         * ensure we only trigger the rehash if it is still needed. 
+         */
+        theta = __lustre_hash_theta(lh);
+        if ((theta >= lh->lh_min_theta) && (theta <= lh->lh_max_theta)) {
+                OBD_VFREE(rehash_buckets, sizeof(*rehash_buckets) << bits);
+                write_unlock(&lh->lh_rwlock);
+                RETURN(-EALREADY);
+        }
+  
+        lh_bits = lh->lh_cur_bits;
+        lh_buckets = lh->lh_buckets;
+        lh_mask = (1 << lh_bits) - 1;
+  
+        lh->lh_cur_bits = bits;
+        lh->lh_cur_mask = (1 << bits) - 1;
+        lh->lh_buckets = rehash_buckets;
+        atomic_inc(&lh->lh_rehash_count);
+
+        for (i = 0; i <= lh_mask; i++) {
+                lh_lhb = &lh_buckets[i];
+
+                write_lock(&lh_lhb->lhb_rwlock);
+                hlist_for_each_safe(hnode, pos, &(lh_lhb->lhb_head)) {
+                        key = lh_key(lh, hnode);
+                        LASSERT(key);
+
+                        /* 
+                         * Validate hnode is in the correct bucket.
+                         */
+                        if (unlikely(lh->lh_flags & LH_DEBUG))
+                                LASSERT(lh_hash(lh, key, lh_mask) == i);
+
+                        /* 
+                         * Delete from old hash bucket.
+                         */
+                        hlist_del(hnode);
+                        LASSERT(atomic_read(&lh_lhb->lhb_count) > 0);
+                        atomic_dec(&lh_lhb->lhb_count);
+
+                        /* 
+                         * Add to rehash bucket, ops->lh_key must be defined. 
+                         */
+                        rehash_lhb = &rehash_buckets[lh_hash(lh, key, mask)];
+                        hlist_add_head(hnode, &(rehash_lhb->lhb_head));
+                        atomic_inc(&rehash_lhb->lhb_count);
+                }
+  
+                LASSERT(hlist_empty(&(lh_lhb->lhb_head)));
+                LASSERT(atomic_read(&lh_lhb->lhb_count) == 0);
+                write_unlock(&lh_lhb->lhb_rwlock);
+        }
+  
+        OBD_VFREE(lh_buckets, sizeof(*lh_buckets) << lh_bits);
+        write_unlock(&lh->lh_rwlock);
+  
+        RETURN(0);
  }
-
-/*
- * define (net_peer <-> connection) hash operations and function define
+EXPORT_SYMBOL(lustre_hash_rehash);
+  
+/**
+ * Rehash the object referenced by @hnode in the lustre hash @lh.  The
+ * @old_key must be provided to locate the objects previous location
+ * in the hash, and the @new_key will be used to reinsert the object.
+ * Use this function instead of a lustre_hash_add() + lustre_hash_del()
+ * combo when it is critical that there is no window in time where the
+ * object is missing from the hash.  When an object is being rehashed
+ * the registered lh_get() and lh_put() functions will not be called.
   */
-
-/* define the conn hash operations */
-struct lustre_hash_operations conn_hash_operations = {
-        .lustre_hashfn = conn_hashfn,
-        .lustre_hash_key_compare = conn_hash_key_compare,
-        .lustre_hash_object_refcount_get = conn_refcount_get,
-        .lustre_hash_object_refcount_put = conn_refcount_put,
-};
-EXPORT_SYMBOL(conn_hash_operations);
-
-__u32 conn_hashfn(struct lustre_class_hash_body *hash_body,  void * key)
-{
-        return djb2_hashfn(hash_body, key, sizeof(lnet_process_id_t));
-}
-
-int conn_hash_key_compare(void *key, struct hlist_node *compared_hnode)
-{
-        struct ptlrpc_connection *c = NULL;
-        lnet_process_id_t *conn_key = NULL;
-
-        LASSERT( key != NULL);
-
-        conn_key = (lnet_process_id_t*)key;
-
-        c = hlist_entry(compared_hnode, struct ptlrpc_connection, c_hash);
-
-        return (conn_key->nid == c->c_peer.nid &&
-                conn_key->pid == c->c_peer.pid);
-}
-
-void *conn_refcount_get(struct hlist_node *actual_hnode)
-{
-        struct ptlrpc_connection *c = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        c = hlist_entry(actual_hnode, struct ptlrpc_connection, c_hash);
-
-        LASSERT(c != NULL);
-
-        atomic_inc(&c->c_refcount);
-
-        RETURN(c);
-}
-
-void conn_refcount_put(struct hlist_node *actual_hnode)
+void lustre_hash_rehash_key(lustre_hash_t *lh, void *old_key, void *new_key,
+                            struct hlist_node *hnode)
  {
-        struct ptlrpc_connection *c = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        c = hlist_entry(actual_hnode, struct ptlrpc_connection, c_hash);
-
-        LASSERT(c != NULL);
-
-        atomic_dec(&c->c_refcount);
-}
-
-/*******************************************************************************/
-/* ( nid<>nidstats ) hash operations define */
-
-struct lustre_hash_operations nid_stat_hash_operations = {
-        .lustre_hashfn = nid_hashfn,
-        .lustre_hash_key_compare = nidstats_hash_key_compare,
-        .lustre_hash_object_refcount_get = nidstats_refcount_get,
-        .lustre_hash_object_refcount_put = nidstats_refcount_put,
-};
-EXPORT_SYMBOL(nid_stat_hash_operations);
-
-int nidstats_hash_key_compare(void *key, struct hlist_node * compared_hnode)
-{
-        struct nid_stat *data;
-        lnet_nid_t *nid_key;
-
-        LASSERT( key != NULL);
-
-        nid_key = (lnet_nid_t*)key;
-        data = hlist_entry(compared_hnode, struct nid_stat, nid_hash);
-
-        return (data->nid == *nid_key);
-}
-
-void* nidstats_refcount_get(struct hlist_node * actual_hnode)
-{
-        struct nid_stat *data;
-
-        data = hlist_entry(actual_hnode, struct nid_stat, nid_hash);
-        data->nid_exp_ref_count++;
-
-        RETURN(data);
-}
-
-void nidstats_refcount_put(struct hlist_node * actual_hnode)
-{
-        struct nid_stat *data;
-
-        data = hlist_entry(actual_hnode, struct nid_stat, nid_hash);
-        data->nid_exp_ref_count--;
+        lustre_hash_bucket_t  *old_lhb;
+        lustre_hash_bucket_t  *new_lhb;
+        unsigned               i;
+        int                    j;
+        ENTRY;
+  
+        __lustre_hash_key_validate(lh, new_key, hnode);
+        LASSERT(!hlist_unhashed(hnode));
+  
+        read_lock(&lh->lh_rwlock);
+  
+        i = lh_hash(lh, old_key, lh->lh_cur_mask);
+        old_lhb = &lh->lh_buckets[i];
+        LASSERT(i <= lh->lh_cur_mask);
+
+        j = lh_hash(lh, new_key, lh->lh_cur_mask);
+        new_lhb = &lh->lh_buckets[j];
+        LASSERT(j <= lh->lh_cur_mask);
+
+        write_lock(&old_lhb->lhb_rwlock);
+        write_lock(&new_lhb->lhb_rwlock);
+
+        /* 
+         * Migrate item between hash buckets without calling
+         * the lh_get() and lh_put() callback functions. 
+         */
+        hlist_del(hnode);
+        LASSERT(atomic_read(&old_lhb->lhb_count) > 0);
+        atomic_dec(&old_lhb->lhb_count);
+        hlist_add_head(hnode, &(new_lhb->lhb_head));
+        atomic_inc(&new_lhb->lhb_count);
+
+        write_unlock(&new_lhb->lhb_rwlock);
+        write_unlock(&old_lhb->lhb_rwlock);
+        read_unlock(&lh->lh_rwlock);
+  
          EXIT;
  }
-
-/*******************************************************************************/
-
-#ifdef __KERNEL__
-/*
- * define ( lqs <-> qctxt ) hash operations and function define
- */
-
-/* define the conn hash operations */
-struct lustre_hash_operations lqs_hash_operations = {
-        .lustre_hashfn = lqs_hashfn,
-        .lustre_hash_key_compare = lqs_hash_key_compare,
-        .lustre_hash_object_refcount_get = lqs_refcount_get,
-        .lustre_hash_object_refcount_put = lqs_refcount_put,
-};
-EXPORT_SYMBOL(lqs_hash_operations);
-
-/* string hashing using djb2 hash algorithm */
-__u32 lqs_hashfn(struct lustre_class_hash_body *hash_body,  void * key)
-{
-        struct quota_adjust_qunit *lqs_key = NULL;
-        __u32 hash;
-
-        LASSERT(key != NULL);
-
-        lqs_key = (struct quota_adjust_qunit *)key;
-
-        hash = QAQ_IS_GRP(lqs_key) ? 5381 : 5387;
-        hash *= lqs_key->qaq_id;
-
-        hash &= (hash_body->lchb_hash_max_size - 1);
-
-        RETURN(hash);
-}
-
-int lqs_hash_key_compare(void *key, struct hlist_node *compared_hnode)
-{
-        struct quota_adjust_qunit *lqs_key = NULL;
-        struct lustre_qunit_size *q = NULL;
-        int retval = 0;
-
-        LASSERT( key != NULL);
-
-        lqs_key = (struct quota_adjust_qunit *)key;
-
-        q = hlist_entry(compared_hnode, struct lustre_qunit_size, lqs_hash);
-
-        spin_lock(&q->lqs_lock);
-        if (lqs_key->qaq_id == q->lqs_id && QAQ_IS_GRP(lqs_key) == LQS_IS_GRP(q))
-                 retval = 1;
-        spin_unlock(&q->lqs_lock);
-
-        return retval;
-}
-
-void * lqs_refcount_get(struct hlist_node * actual_hnode)
+EXPORT_SYMBOL(lustre_hash_rehash_key);
+  
+int lustre_hash_debug_header(char *str, int size)
  {
-        struct lustre_qunit_size *q = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        q = hlist_entry(actual_hnode, struct lustre_qunit_size, lqs_hash);
-
-        LASSERT(q != NULL);
-
-        lqs_getref(q);
-
-        RETURN(q);
+        return snprintf(str, size,
+                 "%-36s%6s%6s%6s%6s%6s%6s%6s%7s%6s%s\n",
+                 "name", "cur", "min", "max", "theta", "t-min", "t-max",
+                 "flags", "rehash", "count", " distribution");
  }
+EXPORT_SYMBOL(lustre_hash_debug_header);
  
-void lqs_refcount_put(struct hlist_node * actual_hnode)
+int lustre_hash_debug_str(lustre_hash_t *lh, char *str, int size)
  {
-        struct lustre_qunit_size *q = NULL;
-
-        LASSERT(actual_hnode != NULL);
-
-        q = hlist_entry(actual_hnode, struct lustre_qunit_size, lqs_hash);
-
-        LASSERT(q != NULL);
-
-        lqs_putref(q);
+        lustre_hash_bucket_t  *lhb;
+        int                    theta;
+        int                    i;
+        int                    c = 0;
+        int                    dist[8] = { 0, };
+
+        if (str == NULL || size == 0)
+                return 0;
+
+        read_lock(&lh->lh_rwlock);
+        theta = __lustre_hash_theta(lh);
+
+        c += snprintf(str + c, size - c, "%-36s ", lh->lh_name);
+        c += snprintf(str + c, size - c, "%5d ",  1 << lh->lh_cur_bits);
+        c += snprintf(str + c, size - c, "%5d ",  1 << lh->lh_min_bits);
+        c += snprintf(str + c, size - c, "%5d ",  1 << lh->lh_max_bits);
+        c += snprintf(str + c, size - c, "%d.%03d ",
+                      __lustre_hash_theta_int(theta),
+                      __lustre_hash_theta_frac(theta));
+        c += snprintf(str + c, size - c, "%d.%03d ",
+                      __lustre_hash_theta_int(lh->lh_min_theta),
+                      __lustre_hash_theta_frac(lh->lh_min_theta));
+        c += snprintf(str + c, size - c, "%d.%03d ",
+                      __lustre_hash_theta_int(lh->lh_max_theta),
+                      __lustre_hash_theta_frac(lh->lh_max_theta));
+        c += snprintf(str + c, size - c, " 0x%02x ", lh->lh_flags);
+        c += snprintf(str + c, size - c, "%6d ",
+                      atomic_read(&lh->lh_rehash_count));
+        c += snprintf(str + c, size - c, "%5d ",
+                      atomic_read(&lh->lh_count));
+
+        /* 
+         * The distribution is a summary of the chained hash depth in
+         * each of the lustre hash buckets.  Each buckets lhb_count is
+         * divided by the hash theta value and used to generate a
+         * histogram of the hash distribution.  A uniform hash will
+         * result in all hash buckets being close to the average thus
+         * only the first few entries in the histogram will be non-zero.
+         * If you hash function results in a non-uniform hash the will
+         * be observable by outlier bucks in the distribution histogram.
+         *
+         * Uniform hash distribution:      128/128/0/0/0/0/0/0
+         * Non-Uniform hash distribution:  128/125/0/0/0/0/2/1
+         */
+        lh_for_each_bucket(lh, lhb, i)
+                dist[min(__fls(atomic_read(&lhb->lhb_count)/max(theta,1)),7)]++;
+
+        for (i = 0; i < 8; i++)
+                c += snprintf(str + c, size - c, "%d%c",  dist[i],
+                              (i == 7) ? '\n' : '/');
+  
+        read_unlock(&lh->lh_rwlock);
+  
+        return c;
  }
-#endif
+EXPORT_SYMBOL(lustre_hash_debug_str);
diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c

index ef4cc0e..9a12337 100644 (file)
--- a/lustre/obdclass/class_obd.c
+++ b/lustre/obdclass/class_obd.c
@@ -1,30 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Object Devices Class Driver
+ * GPL HEADER START
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- * These are the only exported functions, they provide some generic
- * infrastructure for managing object devices
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_CLASS
@@ -51,6 +58,7 @@ atomic_t libcfs_kmemory = {0};
  struct obd_device *obd_devs[MAX_OBD_DEVICES];
  struct list_head obd_types;
  spinlock_t obd_dev_lock = SPIN_LOCK_UNLOCKED;
+cfs_mem_cache_t *obd_lvfs_ctxt_cache;
  
  /* The following are visible and mutable through /proc/sys/lustre/. */
  unsigned int obd_debug_peer_on_timeout;
@@ -265,16 +273,16 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                  if (!data->ioc_inlbuf1) {
                          CERROR("No buffer passed in ioctl\n");
                          GOTO(out, err = -EINVAL);
-                } 
+                }
                  if (data->ioc_inllen1 < 128) {
                          CERROR("ioctl buffer too small to hold version\n");
                          GOTO(out, err = -EINVAL);
                  }
-                                
+
                  obd = class_num2obd(index);
                  if (!obd)
                          GOTO(out, err = -ENOENT);
-                
+
                  if (obd->obd_stopping)
                          status = "ST";
                  else if (obd->obd_set_up)
@@ -282,7 +290,7 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                  else if (obd->obd_attached)
                          status = "AT";
                  else
-                        status = "--"; 
+                        status = "--";
                  str = (char *)data->ioc_bulk;
                  snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
                           (int)index, status, obd->obd_type->typ_name,
@@ -295,14 +303,24 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
  
          }
  
-        if (data->ioc_dev >= class_devno_max()) {
+        if (data->ioc_dev == OBD_DEV_BY_DEVNAME) {
+                if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL)
+                        GOTO(out, err = -EINVAL);
+                if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME)
+                        GOTO(out, err = -EINVAL);
+                obd = class_name2obd(data->ioc_inlbuf4);
+        } else if (data->ioc_dev < class_devno_max()) {
+                obd = class_num2obd(data->ioc_dev);
+        } else {
                  CERROR("OBD ioctl: No device\n");
                  GOTO(out, err = -EINVAL);
          }
  
-        obd = class_num2obd(data->ioc_dev);
          if (obd == NULL) {
-                CERROR("OBD ioctl : No Device %d\n", data->ioc_dev);
+                if (data->ioc_dev == OBD_DEV_BY_DEVNAME)
+                        CERROR("OBD ioctl: No Device %s\n", data->ioc_inlbuf4);
+                else
+                        CERROR("OBD ioctl: No Device %d\n", data->ioc_dev);
                  GOTO(out, err = -EINVAL);
          }
          LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
@@ -320,7 +338,9 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                  }
                  CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
                         obd->obd_name);
+                spin_lock_bh(&obd->obd_processing_task_lock);
                  obd->obd_no_transno = 1;
+                spin_unlock_bh(&obd->obd_processing_task_lock);
                  GOTO(out, err = 0);
          }
  
@@ -368,6 +388,7 @@ void *obd_psdev = NULL;
  #endif
  
  EXPORT_SYMBOL(obd_devs);
+EXPORT_SYMBOL(obd_lvfs_ctxt_cache);
  EXPORT_SYMBOL(obd_print_fail_loc);
  EXPORT_SYMBOL(obd_race_waitq);
  EXPORT_SYMBOL(obd_race_state);
@@ -489,7 +510,7 @@ int obd_init_checks(void)
                  ret = -EINVAL;
          }
          if ((u64val & ~CFS_PAGE_MASK) >= CFS_PAGE_SIZE) {
-                CWARN("mask failed: u64val "LPU64" >= %lu\n", u64val, 
+                CWARN("mask failed: u64val "LPU64" >= %lu\n", u64val,
                        (unsigned long)CFS_PAGE_SIZE);
                  ret = -EINVAL;
          }
@@ -513,21 +534,17 @@ int init_obdclass(void)
          int i, err;
  #ifdef __KERNEL__
          int lustre_register_fs(void);
-
-        printk(KERN_INFO "Lustre: OBD class driver, info@clusterfs.com\n");
-        printk(KERN_INFO "        Lustre Version: "LUSTRE_VERSION_STRING"\n");
-        printk(KERN_INFO "        Build Version: "BUILD_VERSION"\n");
-#else
-        CDEBUG(D_INFO, "Lustre: OBD class driver, info@clusterfs.com\n");
-        CDEBUG(D_INFO, "        Lustre Version: "LUSTRE_VERSION_STRING"\n");
-        CDEBUG(D_INFO, "        Build Version: "BUILD_VERSION"\n");
  #endif
  
+        LCONSOLE_INFO("OBD class driver, http://www.lustre.org/\n");
+        LCONSOLE_INFO("    Lustre Version: "LUSTRE_VERSION_STRING"\n");
+        LCONSOLE_INFO("    Build Version: "BUILD_VERSION"\n");
+
          spin_lock_init(&obd_types_lock);
          cfs_waitq_init(&obd_race_waitq);
          obd_zombie_impexp_init();
  #ifdef LPROCFS
-        obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, 
+        obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
                                           LPROCFS_STATS_FLAG_PERCPU);
          if (obd_memory == NULL) {
                  CERROR("kmalloc of 'obd_memory' failed\n");
@@ -535,12 +552,17 @@ int init_obdclass(void)
          }
  
          lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
-                             LPROCFS_CNTR_AVGMINMAX, 
+                             LPROCFS_CNTR_AVGMINMAX,
                               "memused", "bytes");
          lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
-                             LPROCFS_CNTR_AVGMINMAX, 
+                             LPROCFS_CNTR_AVGMINMAX,
                               "pagesused", "pages");
  #endif
+        obd_lvfs_ctxt_cache = cfs_mem_cache_create("obd_lvfs_ctxt_cache",
+                sizeof(struct lvfs_run_ctxt), 0, 0);
+        if (obd_lvfs_ctxt_cache == NULL)
+                RETURN(-ENOMEM);
+
          err = obd_init_checks();
          if (err == -EOVERFLOW)
                  return err;
@@ -563,8 +585,13 @@ int init_obdclass(void)
          for (i = 0; i < class_devno_max(); i++)
                  obd_devs[i] = NULL;
  
-        /* Default the dirty page cache cap to 1/2 of system memory */
-        obd_max_dirty_pages = num_physpages / 2;
+        /* Default the dirty page cache cap to 1/2 of system memory.
+         * For clients with less memory, a larger fraction is needed
+         * for other purposes (mostly for BGL). */
+        if (num_physpages <= 512 << (20 - CFS_PAGE_SHIFT))
+                obd_max_dirty_pages = num_physpages / 4;
+        else
+                obd_max_dirty_pages = num_physpages / 2;
  
          err = obd_init_caches();
          if (err)
@@ -618,17 +645,20 @@ static void cleanup_obdclass(void)
          memory_max = obd_memory_max();
          pages_max = obd_pages_max();
  
+        cfs_mem_cache_destroy(obd_lvfs_ctxt_cache);
+
          lprocfs_free_stats(&obd_memory);
-        CDEBUG((memory_leaked | pages_leaked) ? D_ERROR : D_INFO,
-               "obd_memory max: "LPU64", leaked: "LPU64" "
+        CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+               "obd_memory max: "LPU64", leaked: "LPU64"\n",
+               memory_max, memory_leaked);
+        CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
                 "obd_memory_pages max: "LPU64", leaked: "LPU64"\n",
-               memory_max, memory_leaked, 
                 pages_max, pages_leaked);
  
          EXIT;
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/obdclass/darwin/darwin-module.c b/lustre/obdclass/darwin/darwin-module.c

index 287d942..f67ae0e 100644 (file)
--- a/lustre/obdclass/darwin/darwin-module.c
+++ b/lustre/obdclass/darwin/darwin-module.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #define DEBUG_SUBSYSTEM S_CLASS
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
diff --git a/lustre/obdclass/darwin/darwin-sysctl.c b/lustre/obdclass/darwin/darwin-sysctl.c

index f953cf8..3d0d5f6 100644 (file)
--- a/lustre/obdclass/darwin/darwin-sysctl.c
+++ b/lustre/obdclass/darwin/darwin-sysctl.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <sys/param.h>
  #include <sys/kernel.h>
  #include <sys/malloc.h>
@@ -151,4 +187,3 @@ void obd_sysctl_clean (void)
         obd_table_header = NULL;
  #endif
  }
-
diff --git a/lustre/obdclass/debug.c b/lustre/obdclass/debug.c

index 7d3d313..a80cd2c 100644 (file)
--- a/lustre/obdclass/debug.c
+++ b/lustre/obdclass/debug.c
@@ -1,25 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/debug.c
   *
   * Helper routines for dumping data structs for debugging.
   */
@@ -103,9 +117,10 @@ int dump_obdo(struct obdo *oa)
  void dump_lsm(int level, struct lov_stripe_md *lsm)
  {
          CDEBUG(level, "lsm %p, objid "LPX64", maxbytes "LPX64", magic 0x%08X, "
-               "stripe_size %u, stripe_count %u\n", lsm,
+               "stripe_size %u, stripe_count %u pool "LOV_POOLNAMEF"\n", lsm,
                 lsm->lsm_object_id, lsm->lsm_maxbytes, lsm->lsm_magic,
-               lsm->lsm_stripe_size, lsm->lsm_stripe_count);
+               lsm->lsm_stripe_size, lsm->lsm_stripe_count,
+               lsm->lsm_pool_name);
  }
  
  /* XXX assumes only a single page in request */
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c

index 34ad584..2b59509 100644 (file)
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -1,25 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/genops.c
   *
   * These are the only exported functions, they provide some generic
   * infrastructure for managing object devices
@@ -33,6 +47,7 @@
  #include <obd_class.h>
  #include <lprocfs_status.h>
  #include <class_hash.h>
+#include <lustre_export.h>
  
  extern struct list_head obd_types;
  spinlock_t obd_types_lock;
@@ -72,7 +87,7 @@ static void obd_device_free(struct obd_device *obd)
                   "%08x != %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC);
          if (obd->obd_namespace != NULL) {
                  CERROR("obd %p: namespace %p was not properly cleaned up "
-                       "(obd_force=%d)!\n", 
+                       "(obd_force=%d)!\n",
                         obd, obd->obd_namespace, obd->obd_force);
                  LBUG();
          }
@@ -104,7 +119,7 @@ struct obd_type *class_get_type(const char *name)
  #ifdef CONFIG_KMOD
          if (!type) {
                  const char *modname = name;
-                if (strcmp(modname, LUSTRE_MDT_NAME) == 0) 
+                if (strcmp(modname, LUSTRE_MDT_NAME) == 0)
                          modname = LUSTRE_MDS_NAME;
                  if (!request_module(modname)) {
                          CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
@@ -204,7 +219,7 @@ int class_unregister_type(const char *name)
                  RETURN(-EBUSY);
          }
  
-        if (type->typ_procroot) 
+        if (type->typ_procroot)
                  lprocfs_remove(&type->typ_procroot);
  
          spin_lock(&obd_types_lock);
@@ -217,6 +232,17 @@ int class_unregister_type(const char *name)
          RETURN(0);
  } /* class_unregister_type */
  
+/**
+ * Create a new obd device.
+ *
+ * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ *
+ * \param typename [in] obd device type string.
+ * \param name     [in] obd device name.
+ *
+ * \retval NULL if create fails, otherwise return the obd device
+ *         pointer created.
+ */
  struct obd_device *class_newdev(const char *type_name, const char *name)
  {
          struct obd_device *result = NULL;
@@ -225,7 +251,7 @@ struct obd_device *class_newdev(const char *type_name, const char *name)
          int i;
          int new_obd_minor = 0;
  
-        if (strlen(name) > MAX_OBD_NAME) {
+        if (strlen(name) >= MAX_OBD_NAME) {
                  CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME);
                  RETURN(ERR_PTR(-EINVAL));
          }
@@ -237,7 +263,7 @@ struct obd_device *class_newdev(const char *type_name, const char *name)
          }
  
          newdev = obd_device_alloc();
-        if (newdev == NULL) { 
+        if (newdev == NULL) {
                  class_put_type(type);
                  RETURN(ERR_PTR(-ENOMEM));
          }
@@ -267,18 +293,19 @@ struct obd_device *class_newdev(const char *type_name, const char *name)
                          result->obd_minor = i;
                          new_obd_minor = i;
                          result->obd_type = type;
-                        memcpy(result->obd_name, name, strlen(name));
+                        strncpy(result->obd_name, name,
+                                sizeof(result->obd_name) - 1);
                          obd_devs[i] = result;
                  }
          }
          spin_unlock(&obd_dev_lock);
-        
+
          if (result == NULL && i >= class_devno_max()) {
                  CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
                         class_devno_max());
                  result = ERR_PTR(-EOVERFLOW);
          }
-        
+
          if (IS_ERR(result)) {
                  obd_device_free(newdev);
                  class_put_type(type);
@@ -371,15 +398,22 @@ struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
          return class_num2obd(dev);
  }
  
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if ::obd_devs[\a num] does not contains an obd device
+ *         otherwise return the obd device there.
+ */
  struct obd_device *class_num2obd(int num)
  {
          struct obd_device *obd = NULL;
  
          if (num < class_devno_max()) {
                  obd = obd_devs[num];
-                if (obd == NULL) {
+                if (obd == NULL)
                          return NULL;
-                }
  
                  LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
                           "%p obd_magic %08x != %08x\n",
@@ -523,7 +557,7 @@ int obd_init_caches(void)
  
          LASSERT(obd_device_cachep == NULL);
          obd_device_cachep = cfs_mem_cache_create("ll_obd_dev_cache",
-                                                 sizeof(struct obd_device), 
+                                                 sizeof(struct obd_device),
                                                   0, 0);
          if (!obd_device_cachep)
                  GOTO(out, -ENOMEM);
@@ -617,7 +651,7 @@ void __class_export_put(struct obd_export *exp)
  
                  CDEBUG(D_IOCTL, "final put %p/%s\n",
                         exp, exp->exp_client_uuid.uuid);
-        
+
                  spin_lock(&obd_zombie_impexp_lock);
                  list_add(&exp->exp_obd_chain, &obd_zombie_exports);
                  spin_unlock(&obd_zombie_impexp_lock);
@@ -643,7 +677,9 @@ void class_export_destroy(struct obd_export *exp)
                  ptlrpc_put_connection_superhack(exp->exp_connection);
  
          LASSERT(list_empty(&exp->exp_outstanding_replies));
+        LASSERT(list_empty(&exp->exp_uncommitted_replies));
          LASSERT(list_empty(&exp->exp_req_replay_queue));
+        LASSERT(list_empty(&exp->exp_queued_rpc));
          obd_destroy_export(exp);
  
          OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
@@ -664,14 +700,15 @@ struct obd_export *class_new_export(struct obd_device *obd,
                  return ERR_PTR(-ENOMEM);
  
          export->exp_conn_cnt = 0;
+        export->exp_lock_hash = NULL;
          atomic_set(&export->exp_refcount, 2);
          atomic_set(&export->exp_rpc_count, 0);
          export->exp_obd = obd;
          CFS_INIT_LIST_HEAD(&export->exp_outstanding_replies);
+        spin_lock_init(&export->exp_uncommitted_replies_lock);
+        CFS_INIT_LIST_HEAD(&export->exp_uncommitted_replies);
          CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue);
-        /* XXX this should be in LDLM init */
-        CFS_INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks);
-        spin_lock_init(&export->exp_ldlm_data.led_lock);
+        CFS_INIT_LIST_HEAD(&export->exp_queued_rpc);
  
          CFS_INIT_LIST_HEAD(&export->exp_handle.h_link);
          class_handle_hash(&export->exp_handle, export_handle_addref);
@@ -684,15 +721,15 @@ struct obd_export *class_new_export(struct obd_device *obd,
          obd_init_export(export);
  
          if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
-               rc = lustre_hash_additem_unique(obd->obd_uuid_hash_body, cluuid, 
-                                               &export->exp_uuid_hash);
-               if (rc != 0) {
-                       CWARN("%s: denying duplicate export for %s\n",
-                             obd->obd_name, cluuid->uuid);
-                       class_handle_unhash(&export->exp_handle);
-                       OBD_FREE_PTR(export);
-                       return ERR_PTR(-EALREADY);
-               }
+                rc = lustre_hash_add_unique(obd->obd_uuid_hash, cluuid,
+                                            &export->exp_uuid_hash);
+                if (rc != 0) {
+                        LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
+                                      obd->obd_name, cluuid->uuid, rc);
+                        class_handle_unhash(&export->exp_handle);
+                        OBD_FREE_PTR(export);
+                        return ERR_PTR(-EALREADY);
+                }
          }
  
          spin_lock(&obd->obd_dev_lock);
@@ -714,15 +751,22 @@ void class_unlink_export(struct obd_export *exp)
  
          spin_lock(&exp->exp_obd->obd_dev_lock);
          /* delete an uuid-export hashitem from hashtables */
-        if (!hlist_unhashed(&exp->exp_uuid_hash)) {
-                lustre_hash_delitem(exp->exp_obd->obd_uuid_hash_body, 
-                                    &exp->exp_client_uuid, &exp->exp_uuid_hash);
-        }
+        if (!hlist_unhashed(&exp->exp_uuid_hash))
+                lustre_hash_del(exp->exp_obd->obd_uuid_hash,
+                                &exp->exp_client_uuid,
+                                &exp->exp_uuid_hash);
+
          list_del_init(&exp->exp_obd_chain);
          list_del_init(&exp->exp_obd_chain_timed);
          exp->exp_obd->obd_num_exports--;
          spin_unlock(&exp->exp_obd->obd_dev_lock);
-
+        /* Keep these counter valid always */
+        spin_lock_bh(&exp->exp_obd->obd_processing_task_lock);
+        if (exp->exp_delayed)
+                exp->exp_obd->obd_delayed_clients--;
+        else if (exp->exp_replay_needed)
+                exp->exp_obd->obd_recoverable_clients--;
+        spin_unlock_bh(&exp->exp_obd->obd_processing_task_lock);
          class_export_put(exp);
  }
  EXPORT_SYMBOL(class_unlink_export);
@@ -738,8 +782,9 @@ struct obd_import *class_import_get(struct obd_import *import)
          LASSERT(atomic_read(&import->imp_refcount) >= 0);
          LASSERT(atomic_read(&import->imp_refcount) < 0x5a5a5a);
          atomic_inc(&import->imp_refcount);
-        CDEBUG(D_INFO, "import %p refcount=%d\n", import,
-               atomic_read(&import->imp_refcount));
+        CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import,
+               atomic_read(&import->imp_refcount), 
+               import->imp_obd->obd_name);
          return import;
  }
  EXPORT_SYMBOL(class_import_get);
@@ -748,17 +793,16 @@ void class_import_put(struct obd_import *import)
  {
          ENTRY;
  
-        CDEBUG(D_INFO, "import %p refcount=%d\n", import,
-               atomic_read(&import->imp_refcount) - 1);
-
          LASSERT(atomic_read(&import->imp_refcount) > 0);
          LASSERT(atomic_read(&import->imp_refcount) < 0x5a5a5a);
          LASSERT(list_empty(&import->imp_zombie_chain));
  
-        if (atomic_dec_and_test(&import->imp_refcount)) {
+        CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import,
+               atomic_read(&import->imp_refcount) - 1, 
+               import->imp_obd->obd_name);
  
+        if (atomic_dec_and_test(&import->imp_refcount)) {
                  CDEBUG(D_INFO, "final put import %p\n", import);
-                
                  spin_lock(&obd_zombie_impexp_lock);
                  list_add(&import->imp_zombie_chain, &obd_zombie_imports);
                  spin_unlock(&obd_zombie_impexp_lock);
@@ -773,7 +817,7 @@ EXPORT_SYMBOL(class_import_put);
  void class_import_destroy(struct obd_import *import)
  {
          ENTRY;
-        
+
          CDEBUG(D_IOCTL, "destroying import %p\n", import);
  
          LASSERT(atomic_read(&import->imp_refcount) == 0);
@@ -826,6 +870,7 @@ struct obd_import *class_new_import(struct obd_device *obd)
          cfs_waitq_init(&imp->imp_recovery_waitq);
  
          atomic_set(&imp->imp_refcount, 2);
+        atomic_set(&imp->imp_unregistering, 0);
          atomic_set(&imp->imp_inflight, 0);
          atomic_set(&imp->imp_replay_inflight, 0);
          atomic_set(&imp->imp_inval_count, 0);
@@ -834,6 +879,10 @@ struct obd_import *class_new_import(struct obd_device *obd)
          class_handle_hash(&imp->imp_handle, import_handle_addref);
          init_imp_at(&imp->imp_at);
  
+/* b1_8 supports both v1 & v2. but HEAD only supports v2.
+ * So let's use v2.
+ */
+#define HAVE_DEFAULT_V2_CONNECT 1
  #ifdef HAVE_DEFAULT_V2_CONNECT
          /* the default magic is V2, will be used in connect RPC, and
           * then adjusted according to the flags in request/reply. */
@@ -908,10 +957,11 @@ int class_disconnect(struct obd_export *export)
          already_disconnected = export->exp_disconnected;
          export->exp_disconnected = 1;
  
-        if (!hlist_unhashed(&export->exp_nid_hash)) {
-                lustre_hash_delitem(export->exp_obd->obd_nid_hash_body,
-                                    &export->exp_connection->c_peer.nid, &export->exp_nid_hash);
-        }
+        if (!hlist_unhashed(&export->exp_nid_hash))
+                lustre_hash_del(export->exp_obd->obd_nid_hash,
+                                &export->exp_connection->c_peer.nid,
+                                &export->exp_nid_hash);
+
          spin_unlock(&export->exp_lock);
  
          /* class_cleanup(), abort_recovery(), and class_fail_export()
@@ -928,7 +978,8 @@ int class_disconnect(struct obd_export *export)
          RETURN(0);
  }
  
-static void class_disconnect_export_list(struct list_head *list, int flags)
+static void class_disconnect_export_list(struct list_head *list,
+                                         enum obd_option flags)
  {
          int rc;
          struct lustre_handle fake_conn;
@@ -973,38 +1024,35 @@ static void class_disconnect_export_list(struct list_head *list, int flags)
                         exp->exp_obd->obd_name, obd_export_nid2str(exp),
                         exp, exp->exp_last_request_time);
                  rc = obd_disconnect(fake_exp);
+                CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+                       obd_export_nid2str(exp), exp, rc);
                  class_export_put(exp);
          }
          EXIT;
  }
  
-static inline int get_exp_flags_from_obd(struct obd_device *obd)
-{
-        return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
-                (obd->obd_force ? OBD_OPT_FORCE : 0));
-}
-
  void class_disconnect_exports(struct obd_device *obd)
  {
          struct list_head work_list;
          ENTRY;
  
          /* Move all of the exports from obd_exports to a work list, en masse. */
+        CFS_INIT_LIST_HEAD(&work_list);
          spin_lock(&obd->obd_dev_lock);
-        list_add(&work_list, &obd->obd_exports);
-        list_del_init(&obd->obd_exports);
+        list_splice_init(&obd->obd_delayed_exports, &work_list);
+        list_splice_init(&obd->obd_exports, &work_list);
          spin_unlock(&obd->obd_dev_lock);
  
          CDEBUG(D_HA, "OBD device %d (%p) has exports, "
                 "disconnecting them\n", obd->obd_minor, obd);
-        class_disconnect_export_list(&work_list, get_exp_flags_from_obd(obd));
+        class_disconnect_export_list(&work_list, exp_flags_from_obd(obd));
          EXIT;
  }
  EXPORT_SYMBOL(class_disconnect_exports);
  
-/* Remove exports that have not completed recovery.
- */
-void class_disconnect_stale_exports(struct obd_device *obd)
+/* Remove exports that have not completed recovery. */
+void class_disconnect_stale_exports(struct obd_device *obd,
+                                    enum obd_option flags)
  {
          struct list_head work_list;
          struct list_head *pos, *n;
@@ -1017,8 +1065,7 @@ void class_disconnect_stale_exports(struct obd_device *obd)
          list_for_each_safe(pos, n, &obd->obd_exports) {
                  exp = list_entry(pos, struct obd_export, exp_obd_chain);
                  if (exp->exp_replay_needed) {
-                        list_del(&exp->exp_obd_chain);
-                        list_add(&exp->exp_obd_chain, &work_list);
+                        list_move(&exp->exp_obd_chain, &work_list);
                          cnt++;
                  }
          }
@@ -1026,11 +1073,111 @@ void class_disconnect_stale_exports(struct obd_device *obd)
  
          CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
                 obd->obd_name, cnt);
-        class_disconnect_export_list(&work_list, get_exp_flags_from_obd(obd));
+        class_disconnect_export_list(&work_list, flags);
          EXIT;
  }
  EXPORT_SYMBOL(class_disconnect_stale_exports);
  
+void class_disconnect_expired_exports(struct obd_device *obd)
+{
+        struct list_head expired_list;
+        struct obd_export *exp, *n;
+        int cnt = 0;
+        ENTRY;
+
+        CFS_INIT_LIST_HEAD(&expired_list);
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each_entry_safe(exp, n, &obd->obd_delayed_exports,
+                                 exp_obd_chain) {
+                if (exp_expired(exp, obd->u.obt.obt_stale_export_age)) {
+                        list_move(&exp->exp_obd_chain, &expired_list);
+                        cnt++;
+                }
+        }
+        spin_unlock(&obd->obd_dev_lock);
+
+        if (cnt == 0)
+                return;
+
+        CDEBUG(D_INFO, "%s: disconnecting %d expired exports\n",
+               obd->obd_name, cnt);
+        class_disconnect_export_list(&expired_list, exp_flags_from_obd(obd));
+
+        EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_expired_exports);
+
+void class_set_export_delayed(struct obd_export *exp)
+{
+        struct obd_device *obd = class_exp2obd(exp);
+
+        LASSERT(!exp->exp_delayed);
+        spin_lock(&exp->exp_lock);
+        exp->exp_delayed = 1;
+        spin_unlock(&exp->exp_lock);
+
+        /* no need to ping delayed exports */
+        spin_lock(&obd->obd_dev_lock);
+        list_del_init(&exp->exp_obd_chain_timed);
+        list_move_tail(&exp->exp_obd_chain, &obd->obd_delayed_exports);
+        spin_unlock(&obd->obd_dev_lock);
+
+        LASSERT(obd->obd_recoverable_clients > 0);
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        obd->obd_delayed_clients++;
+        obd->obd_recoverable_clients--;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        CDEBUG(D_HA, "%s: set client %s as delayed\n",
+               obd->obd_name, exp->exp_client_uuid.uuid);
+}
+EXPORT_SYMBOL(class_set_export_delayed);
+
+/*
+ * Manage exports that have not completed recovery.
+ */
+void class_handle_stale_exports(struct obd_device *obd)
+{
+        struct list_head delay_list, evict_list;
+        struct obd_export *exp, *n;
+        ENTRY;
+
+        CFS_INIT_LIST_HEAD(&delay_list);
+        CFS_INIT_LIST_HEAD(&evict_list);
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each_entry_safe(exp, n, &obd->obd_exports, exp_obd_chain) {
+                LASSERT(!exp->exp_delayed);
+                /* clients finished recovery */
+                if (!exp->exp_replay_needed)
+                        continue;
+                /* connected non-vbr clients are evicted */
+                if (exp->exp_in_recovery && !exp_connect_vbr(exp)) {
+                        list_move_tail(&exp->exp_obd_chain, &evict_list);
+                        continue;
+                }
+                if (obd->obd_version_recov || !exp->exp_in_recovery)
+                        list_move_tail(&exp->exp_obd_chain, &delay_list);
+        }
+#ifndef HAVE_DELAYED_RECOVERY
+        /* delayed recovery is turned off, evict all delayed exports */
+        list_splice_init(&obd->obd_delayed_exports, &evict_list);
+#endif
+        spin_unlock(&obd->obd_dev_lock);
+
+        list_for_each_entry_safe(exp, n, &delay_list, exp_obd_chain) {
+                class_set_export_delayed(exp);
+                exp->exp_last_request_time = cfs_time_current_sec();
+        }
+        LASSERT(list_empty(&delay_list));
+
+        /* evict clients without VBR support */
+        class_disconnect_export_list(&evict_list, exp_flags_from_obd(obd));
+
+        EXIT;
+}
+EXPORT_SYMBOL(class_handle_stale_exports);
+
  int oig_init(struct obd_io_group **oig_out)
  {
          struct obd_io_group *oig;
@@ -1209,7 +1356,7 @@ char *obd_export_nid2str(struct obd_export *exp)
  {
          if (exp->exp_connection != NULL)
                  return libcfs_nid2str(exp->exp_connection->c_peer.nid);
-        
+
          return "(no nid)";
  }
  EXPORT_SYMBOL(obd_export_nid2str);
@@ -1222,8 +1369,7 @@ int obd_export_evict_by_nid(struct obd_device *obd, char *nid)
          lnet_nid_t nid_key = libcfs_str2nid(nid);
  
          do {
-                doomed_exp = lustre_hash_get_object_by_key(obd->obd_nid_hash_body,
-                                                           &nid_key);
+                doomed_exp = lustre_hash_lookup(obd->obd_nid_hash, &nid_key);
  
                  if (doomed_exp == NULL)
                          break;
@@ -1232,7 +1378,7 @@ int obd_export_evict_by_nid(struct obd_device *obd, char *nid)
                           "nid %s found, wanted nid %s, requested nid %s\n",
                           obd_export_nid2str(doomed_exp),
                           libcfs_nid2str(nid_key), nid);
-        
+
                  exports_evicted++;
                  CDEBUG(D_HA, "%s: evict NID '%s' (%s) #%d at adminstrative request\n",
                         obd->obd_name, nid, doomed_exp->exp_client_uuid.uuid,
@@ -1251,17 +1397,16 @@ EXPORT_SYMBOL(obd_export_evict_by_nid);
  int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid)
  {
          struct obd_export *doomed_exp = NULL;
-        struct obd_uuid doomed;
+        struct obd_uuid doomed_uuid;
          int exports_evicted = 0;
  
-        obd_str2uuid(&doomed, uuid);
-        if(obd_uuid_equals(&doomed, &obd->obd_uuid)) {
+        obd_str2uuid(&doomed_uuid, uuid);
+        if(obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
                  CERROR("%s: can't evict myself\n", obd->obd_name);
                  return exports_evicted;
          }
  
-        doomed_exp = lustre_hash_get_object_by_key(obd->obd_uuid_hash_body, 
-                                                   &doomed);
+        doomed_exp = lustre_hash_lookup(obd->obd_uuid_hash, &doomed_uuid);
  
          if (doomed_exp == NULL) {
                  CERROR("%s: can't disconnect %s: no exports found\n",
diff --git a/lustre/obdclass/linux/linux-module.c b/lustre/obdclass/linux/linux-module.c

index c9807f5..40603d6 100644 (file)
--- a/lustre/obdclass/linux/linux-module.c
+++ b/lustre/obdclass/linux/linux-module.c
@@ -1,28 +1,45 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Object Devices Class Driver
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
+ * lustre/obdclass/linux/linux-module.c
+ *
+ * Object Devices Class Driver
   * These are the only exported functions, they provide some generic
   * infrastructure for managing object devices
   */
+
  #define DEBUG_SUBSYSTEM S_CLASS
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
@@ -82,7 +99,7 @@ int obd_ioctl_getdata(char **buf, int *len, void *arg)
          ENTRY;
  
          err = copy_from_user(&hdr, (void *)arg, sizeof(hdr));
-        if ( err ) 
+        if ( err )
                  RETURN(err);
  
          if (hdr.ioc_version != OBD_IOCTL_VERSION) {
@@ -150,8 +167,8 @@ int obd_ioctl_getdata(char **buf, int *len, void *arg)
  
  int obd_ioctl_popdata(void *arg, void *data, int len)
  {
-        int err; 
-        
+        int err;
+
          err = copy_to_user(arg, data, len);
          if (err)
                  err = -EFAULT;
@@ -228,7 +245,8 @@ int obd_proc_read_version(char *page, char **start, off_t off, int count,
                          BUILD_VERSION);
  #else
          return snprintf(page, count, "lustre: %s\nkernel: %s\nbuild:  %s\n",
-                        LUSTRE_VERSION_STRING, "patchless", BUILD_VERSION);
+                        LUSTRE_VERSION_STRING, "patchless_client",
+                        BUILD_VERSION);
  #endif
  }
  
@@ -245,6 +263,21 @@ int obd_proc_read_pinger(char *page, char **start, off_t off, int count,
                         );
  }
  
+/**
+ * Check all obd devices health
+ *
+ * \param page
+ * \param start
+ * \param off
+ * \param count
+ * \param eof
+ * \param data
+ *                  proc read function parameters, please refer to kernel
+ *                  code fs/proc/generic.c proc_file_read()
+ * \param data [in] unused
+ *
+ * \retval number of characters printed
+ */
  static int obd_proc_read_health(char *page, char **start, off_t off,
                                  int count, int *eof, void *data)
  {
@@ -259,7 +292,7 @@ static int obd_proc_read_health(char *page, char **start, off_t off,
                  struct obd_device *obd;
  
                  obd = class_num2obd(i);
-                if (obd == NULL)
+                if (obd == NULL || !obd->obd_attached || !obd->obd_set_up)
                          continue;
  
                  LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
@@ -313,7 +346,7 @@ static void obd_device_list_seq_stop(struct seq_file *p, void *v)
  }
  
  static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
-{      
+{
          ++*pos;
          if (*pos >= class_devno_max())
                  return NULL;
@@ -410,7 +443,7 @@ int class_procfs_init(void)
  int class_procfs_clean(void)
  {
          ENTRY;
-        if (proc_lustre_root) 
+        if (proc_lustre_root)
                  lprocfs_remove(&proc_lustre_root);
          RETURN(0);
  }
diff --git a/lustre/obdclass/linux/linux-obdo.c b/lustre/obdclass/linux/linux-obdo.c

index a5bf3b9..3c85ae4 100644 (file)
--- a/lustre/obdclass/linux/linux-obdo.c
+++ b/lustre/obdclass/linux/linux-obdo.c
@@ -1,28 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Object Devices Class Driver
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * lustre/obdclass/linux/linux-obdo.c
   *
+ * Object Devices Class Driver
   * These are the only exported functions, they provide some generic
   * infrastructure for managing object devices
   */
@@ -65,7 +78,7 @@ void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid)
          if (ia_valid & ATTR_MODE) {
                  oa->o_mode = attr->ia_mode;
                  oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE;
-                if (!in_group_p(oa->o_gid) && !capable(CAP_FSETID))
+                if (!in_group_p(oa->o_gid) && !cfs_capable(CFS_CAP_FSETID))
                          oa->o_mode &= ~S_ISGID;
          }
          if (ia_valid & ATTR_UID) {
@@ -113,7 +126,7 @@ void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid)
          if (valid & OBD_MD_FLMODE) {
                  attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT);
                  attr->ia_valid |= ATTR_MODE;
-                if (!in_group_p(oa->o_gid) && !capable(CAP_FSETID))
+                if (!in_group_p(oa->o_gid) && !cfs_capable(CFS_CAP_FSETID))
                          attr->ia_mode &= ~S_ISGID;
          }
          if (valid & OBD_MD_FLUID) {
@@ -207,16 +220,10 @@ void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
  
          if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime))
                  LTIME_S(dst->i_atime) = src->o_atime;
-
-        /* mtime is always updated with ctime, but can be set in past.
-           As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, leave mtime from mds 
-           for the same ctimes. */
-        if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) {
+        if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime))
+                LTIME_S(dst->i_mtime) = src->o_mtime;
+        if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
                  LTIME_S(dst->i_ctime) = src->o_ctime;
-                if (valid & OBD_MD_FLMTIME)
-                        LTIME_S(dst->i_mtime) = src->o_mtime;
-        }
          if (valid & OBD_MD_FLSIZE)
                  i_size_write(dst, src->o_size);
          /* optimum IO size */
@@ -285,4 +292,3 @@ void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
  }
  EXPORT_SYMBOL(obdo_to_inode);
  #endif
-
diff --git a/lustre/obdclass/linux/linux-sysctl.c b/lustre/obdclass/linux/linux-sysctl.c

index 0f860c3..b9eac5e 100644 (file)
--- a/lustre/obdclass/linux/linux-sysctl.c
+++ b/lustre/obdclass/linux/linux-sysctl.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <linux/module.h>
@@ -30,9 +41,6 @@
  #include <linux/mm.h>
  #include <linux/sysctl.h>
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/swapctl.h>
-#endif
  #include <linux/proc_fs.h>
  #include <linux/slab.h>
  #include <linux/stat.h>
@@ -64,6 +72,7 @@ enum {
          OBD_DUMP_ON_EVICTION,   /* dump kernel debug log upon eviction */
          OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
          OBD_ALLOC_FAIL_RATE,    /* memory allocation random failure rate */
+        OBD_MAX_DIRTY_PAGES,    /* maximum dirty pages */
  };
  
  int LL_PROC_PROTO(proc_fail_loc)
@@ -87,6 +96,47 @@ int LL_PROC_PROTO(proc_set_timeout)
          return rc;
  }
  
+int LL_PROC_PROTO(proc_max_dirty_pages_in_mb)
+{
+        int rc = 0;
+        DECLARE_LL_PROC_PPOS_DECL;
+
+        if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+                *lenp = 0;
+                return 0;
+        }
+        if (write) {
+                rc = lprocfs_write_frac_helper(buffer, *lenp,
+                                               (unsigned int*)table->data,
+                                               1 << (20 - CFS_PAGE_SHIFT));
+                /* Don't allow them to let dirty pages exceed 90% of system memory,
+                 * and set a hard minimum of 4MB. */
+                if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) {
+                        CERROR("Refusing to set max dirty pages to %u, which "
+                               "is more than 90%% of available RAM; setting to %lu\n",
+                               obd_max_dirty_pages, ((num_physpages / 10) * 9));
+                        obd_max_dirty_pages = ((num_physpages / 10) * 9);
+                } else if (obd_max_dirty_pages < 4 << (20 - CFS_PAGE_SHIFT)) {
+                        obd_max_dirty_pages = 4 << (20 - CFS_PAGE_SHIFT);
+                }
+        } else {
+                char buf[21];
+                int len;
+
+                len = lprocfs_read_frac_helper(buf, sizeof(buf),
+                                               *(unsigned int*)table->data,
+                                               1 << (20 - CFS_PAGE_SHIFT));
+                if (len > *lenp)
+                        len = *lenp;
+                buf[len] = '\0';
+                if (copy_to_user(buffer, buf, len))
+                        return -EFAULT;
+                *lenp = len;
+        }
+        *ppos += *lenp;
+        return rc;
+}
+
  #ifdef RANDOM_FAIL_ALLOC
  int LL_PROC_PROTO(proc_alloc_fail_rate)
  {
@@ -98,7 +148,7 @@ int LL_PROC_PROTO(proc_alloc_fail_rate)
                  return 0;
          }
          if (write) {
-                rc = lprocfs_write_frac_helper(buffer, *lenp, 
+                rc = lprocfs_write_frac_helper(buffer, *lenp,
                                                 (unsigned int*)table->data,
                                                 OBD_ALLOC_FAIL_MULT);
          } else {
@@ -130,9 +180,9 @@ int LL_PROC_PROTO(proc_memory_alloc)
                  *lenp = 0;
                  return 0;
          }
-        if (write) 
+        if (write)
                  return -EINVAL;
-        
+
          len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_sum());
          if (len > *lenp)
                  len = *lenp;
@@ -315,6 +365,14 @@ static cfs_sysctl_table_t obd_table[] = {
                  .proc_handler = &proc_alloc_fail_rate
          },
  #endif
+        {
+                .ctl_name = OBD_MAX_DIRTY_PAGES,
+                .procname = "max_dirty_mb",
+                .data     = &obd_max_dirty_pages,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_max_dirty_pages_in_mb
+        },
          { 0 }
  };
  
diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c

index b95e30a..2a5ba2d 100644 (file)
--- a/lustre/obdclass/llog.c
+++ b/lustre/obdclass/llog.c
@@ -1,32 +1,46 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * OST<->MDS recovery logging infrastructure.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
+ * lustre/obdclass/llog.c
+ *
+ * OST<->MDS recovery logging infrastructure.
   * Invariants in implementation:
   * - we do not share logs among different OST<->MDS connections, so that
   *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -86,17 +100,17 @@ int llog_cancel_rec(struct llog_handle *loghandle, int index)
          int rc = 0;
          ENTRY;
  
-        CDEBUG(D_RPCTRACE, "canceling %d in log "LPX64"\n",
+        CDEBUG(D_RPCTRACE, "Canceling %d in log "LPX64"\n",
                 index, loghandle->lgh_id.lgl_oid);
  
          if (index == 0) {
-                CERROR("cannot cancel index 0 (which is header)\n");
+                CERROR("Can't cancel index 0 which is header\n");
                  RETURN(-EINVAL);
          }
  
          if (!ext2_clear_bit(index, llh->llh_bitmap)) {
-                CDEBUG(D_RPCTRACE, "catalog index %u already clear?\n", index);
-                RETURN(-EINVAL);
+                CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index);
+                RETURN(-ENOENT);
          }
  
          llh->llh_count--;
@@ -106,7 +120,7 @@ int llog_cancel_rec(struct llog_handle *loghandle, int index)
              (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) {
                  rc = llog_destroy(loghandle);
                  if (rc) {
-                        CERROR("failure destroying log after last cancel: %d\n",
+                        CERROR("Failure destroying log after last cancel: %d\n",
                                 rc);
                          ext2_set_bit(index, llh->llh_bitmap);
                          llh->llh_count++;
@@ -118,7 +132,7 @@ int llog_cancel_rec(struct llog_handle *loghandle, int index)
  
          rc = llog_write_rec(loghandle, &llh->llh_hdr, NULL, 0, NULL, 0);
          if (rc) {
-                CERROR("failure re-writing header %d\n", rc);
+                CERROR("Failure re-writing header %d\n", rc);
                  ext2_set_bit(index, llh->llh_bitmap);
                  llh->llh_count++;
          }
@@ -231,11 +245,11 @@ static int llog_process_thread(void *arg)
          cfs_daemonize_ctxt("llog_process_thread");
  
          if (cd != NULL) {
-                last_called_index = cd->first_idx;
-                index = cd->first_idx + 1;
+                last_called_index = cd->lpcd_first_idx;
+                index = cd->lpcd_first_idx + 1;
          }
-        if (cd != NULL && cd->last_idx)
-                last_index = cd->last_idx;
+        if (cd != NULL && cd->lpcd_last_idx)
+                last_index = cd->lpcd_last_idx;
          else
                  last_index = LLOG_BITMAP_BYTES * 8 - 1;
  
@@ -333,7 +347,7 @@ static int llog_process_thread(void *arg)
  
   out:
          if (cd != NULL)
-                cd->last_idx = last_called_index;
+                cd->lpcd_last_idx = last_called_index;
          if (buf)
                  OBD_FREE(buf, LLOG_CHUNK_SIZE);
          lpi->lpi_rc = rc;
@@ -400,9 +414,9 @@ int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb,
                  RETURN(-ENOMEM);
  
          if (cd != NULL)
-                first_index = cd->first_idx + 1;
-        if (cd != NULL && cd->last_idx)
-                index = cd->last_idx;
+                first_index = cd->lpcd_first_idx + 1;
+        if (cd != NULL && cd->lpcd_last_idx)
+                index = cd->lpcd_last_idx;
          else
                  index = LLOG_BITMAP_BYTES * 8 - 1;
  
diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c

index 6cb3175..10aa38e 100644 (file)
--- a/lustre/obdclass/llog_cat.c
+++ b/lustre/obdclass/llog_cat.c
@@ -1,32 +1,47 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_cat.c
   *
   * OST<->MDS recovery logging infrastructure.
   *
   * Invariants in implementation:
   * - we do not share logs among different OST<->MDS connections, so that
   *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -69,11 +84,11 @@ static struct llog_handle *llog_cat_new_log(struct llog_handle *cathandle)
  
          if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_LLOG_CREATE_FAILED))
                  RETURN(ERR_PTR(-ENOSPC));
-        
+
          rc = llog_create(cathandle->lgh_ctxt, &loghandle, NULL, NULL);
          if (rc)
                  RETURN(ERR_PTR(rc));
-        
+
          rc = llog_init_handle(loghandle,
                                LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
                                &cathandle->lgh_hdr->llh_tgtuuid);
@@ -113,13 +128,12 @@ static struct llog_handle *llog_cat_new_log(struct llog_handle *cathandle)
          LASSERT(list_empty(&loghandle->u.phd.phd_entry));
          list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head);
  
- out_destroy:
+out_destroy:
          if (rc < 0)
                  llog_destroy(loghandle);
  
          RETURN(loghandle);
  }
-EXPORT_SYMBOL(llog_cat_new_log);
  
  /* Open an existent log handle and add it to the open list.
   * This log handle will be closed when all of the records in it are removed.
@@ -294,7 +308,7 @@ EXPORT_SYMBOL(llog_cat_add_rec);
   * Assumes caller has already pushed us into the kernel context.
   */
  int llog_cat_cancel_records(struct llog_handle *cathandle, int count,
-                        struct llog_cookie *cookies)
+                            struct llog_cookie *cookies)
  {
          int i, index, rc = 0;
          ENTRY;
@@ -379,14 +393,14 @@ int llog_cat_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data)
                  CWARN("catlog "LPX64" crosses index zero\n",
                        cat_llh->lgh_id.lgl_oid);
  
-                cd.first_idx = llh->llh_cat_idx;
-                cd.last_idx = 0;
+                cd.lpcd_first_idx = llh->llh_cat_idx;
+                cd.lpcd_last_idx = 0;
                  rc = llog_process(cat_llh, llog_cat_process_cb, &d, &cd);
                  if (rc != 0)
                          RETURN(rc);
  
-                cd.first_idx = 0;
-                cd.last_idx = cat_llh->lgh_last_idx;
+                cd.lpcd_first_idx = 0;
+                cd.lpcd_last_idx = cat_llh->lgh_last_idx;
                  rc = llog_process(cat_llh, llog_cat_process_cb, &d, &cd);
          } else {
                  rc = llog_process(cat_llh, llog_cat_process_cb, &d, NULL);
@@ -396,6 +410,56 @@ int llog_cat_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data)
  }
  EXPORT_SYMBOL(llog_cat_process);
  
+#ifdef __KERNEL__
+int llog_cat_process_thread(void *data)
+{
+        struct llog_process_cat_args *args = data;
+        struct llog_ctxt *ctxt = args->lpca_ctxt;
+        struct llog_handle *llh = NULL;
+        void  *cb = args->lpca_cb;
+        struct llog_logid logid;
+        int rc;
+        ENTRY;
+
+        cfs_daemonize_ctxt("ll_log_process");
+
+        logid = *(struct llog_logid *)(args->lpca_arg);
+        rc = llog_create(ctxt, &llh, &logid, NULL);
+        if (rc) {
+                CERROR("llog_create() failed %d\n", rc);
+                GOTO(out, rc);
+        }
+        rc = llog_init_handle(llh, LLOG_F_IS_CAT, NULL);
+        if (rc) {
+                CERROR("llog_init_handle failed %d\n", rc);
+                GOTO(release_llh, rc);
+        }
+
+        if (cb) {
+                rc = llog_cat_process(llh, (llog_cb_t)cb, NULL);
+                if (rc != LLOG_PROC_BREAK && rc != 0)
+                        CERROR("llog_cat_process() failed %d\n", rc);
+        } else {
+                CWARN("No callback function for recovery\n");
+        }
+
+        /* 
+         * Make sure that all cached data is sent. 
+         */
+        llog_sync(ctxt, NULL);
+        GOTO(release_llh, rc);
+release_llh:
+        rc = llog_cat_put(llh);
+        if (rc)
+                CERROR("llog_cat_put() failed %d\n", rc);
+out:
+        llog_ctxt_put(ctxt);
+        OBD_FREE_PTR(args);
+        return rc;
+}
+EXPORT_SYMBOL(llog_cat_process_thread);
+#endif
+
  static int llog_cat_reverse_process_cb(struct llog_handle *cat_llh,
                                         struct llog_rec_hdr *rec, void *data)
  {
@@ -440,15 +504,15 @@ int llog_cat_reverse_process(struct llog_handle *cat_llh,
                  CWARN("catalog "LPX64" crosses index zero\n",
                        cat_llh->lgh_id.lgl_oid);
  
-                cd.first_idx = 0;
-                cd.last_idx = cat_llh->lgh_last_idx;
+                cd.lpcd_first_idx = 0;
+                cd.lpcd_last_idx = cat_llh->lgh_last_idx;
                  rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb,
                                            &d, &cd);
                  if (rc != 0)
                          RETURN(rc);
  
-                cd.first_idx = le32_to_cpu(llh->llh_cat_idx);
-                cd.last_idx = 0;
+                cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx);
+                cd.lpcd_last_idx = 0;
                  rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb,
                                            &d, &cd);
          } else {
@@ -491,49 +555,3 @@ out:
  
          RETURN(0);
  }
-
-#if 0
-/* Assumes caller has already pushed us into the kernel context. */
-int llog_cat_init(struct llog_handle *cathandle, struct obd_uuid *tgtuuid)
-{
-        struct llog_log_hdr *llh;
-        loff_t offset = 0;
-        int rc = 0;
-        ENTRY;
-
-        LASSERT(sizeof(*llh) == LLOG_CHUNK_SIZE);
-
-        down(&cathandle->lgh_lock);
-        llh = cathandle->lgh_hdr;
-
-        if (i_size_read(cathandle->lgh_file->f_dentry->d_inode) == 0) {
-                llog_write_rec(cathandle, &llh->llh_hdr, NULL, 0, NULL, 0);
-
-write_hdr:
-                rc = lustre_fwrite(cathandle->lgh_file, llh, LLOG_CHUNK_SIZE,
-                                   &offset);
-                if (rc != LLOG_CHUNK_SIZE) {
-                        CERROR("error writing catalog header: rc %d\n", rc);
-                        OBD_FREE(llh, sizeof(*llh));
-                        if (rc >= 0)
-                                rc = -ENOSPC;
-                } else
-                        rc = 0;
-        } else {
-                rc = lustre_fread(cathandle->lgh_file, llh, LLOG_CHUNK_SIZE,
-                                  &offset);
-                if (rc != LLOG_CHUNK_SIZE) {
-                        CERROR("error reading catalog header: rc %d\n", rc);
-                        /* Can we do much else if the header is bad? */
-                        goto write_hdr;
-                } else
-                        rc = 0;
-        }
-
-        cathandle->lgh_tgtuuid = &llh->llh_tgtuuid;
-        up(&cathandle->lgh_lock);
-        RETURN(rc);
-}
-EXPORT_SYMBOL(llog_cat_init);
-
-#endif
diff --git a/lustre/obdclass/llog_internal.h b/lustre/obdclass/llog_internal.h

index 82bb2e3..4c31016 100644 (file)
--- a/lustre/obdclass/llog_internal.h
+++ b/lustre/obdclass/llog_internal.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef __LLOG_INTERNAL_H__
  #define __LLOG_INTERNAL_H__
  
@@ -13,7 +49,9 @@ struct llog_process_info {
  };
  
  int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray);
+                      char *name, int idx, int count,
+                      struct llog_catid *idarray);
+
  int llog_cat_id2handle(struct llog_handle *cathandle, struct llog_handle **res,
                         struct llog_logid *logid);
  int class_config_dump_handler(struct llog_handle * handle,
diff --git a/lustre/obdclass/llog_ioctl.c b/lustre/obdclass/llog_ioctl.c

index bc8afc6..261c031 100644 (file)
--- a/lustre/obdclass/llog_ioctl.c
+++ b/lustre/obdclass/llog_ioctl.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2005 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -420,16 +432,14 @@ int llog_catalog_list(struct obd_device *obd, int count,
          ENTRY;
          size = sizeof(*idarray) * count;
  
-        OBD_ALLOC(idarray, size);
+        OBD_VMALLOC(idarray, size);
          if (!idarray)
                  RETURN(-ENOMEM);
-        memset(idarray, 0, size);
  
-        rc = llog_get_cat_list(obd, obd, name, count, idarray);
-        if (rc) {
-                OBD_FREE(idarray, size);
-                RETURN(rc);
-        }
+        mutex_down(&obd->obd_llog_cat_process);
+        rc = llog_get_cat_list(obd, obd, name, 0, count, idarray);
+        if (rc)
+                GOTO(out, rc);
  
          out = data->ioc_bulk;
          remains = data->ioc_inllen1;
@@ -445,8 +455,12 @@ int llog_catalog_list(struct obd_device *obd, int count,
                          break;
                  }
          }
-        OBD_FREE(idarray, size);
-        RETURN(0);
+out:
+        /* release semaphore */
+        mutex_up(&obd->obd_llog_cat_process);
+
+        OBD_VFREE(idarray, size);
+        RETURN(rc);
  
  }
  EXPORT_SYMBOL(llog_catalog_list);
diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c

index 6204fac..05b5066 100644 (file)
--- a/lustre/obdclass/llog_lvfs.c
+++ b/lustre/obdclass/llog_lvfs.c
@@ -1,32 +1,46 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * OST<->MDS recovery logging infrastructure.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_lvfs.c
   *
+ * OST<->MDS recovery logging infrastructure.
   * Invariants in implementation:
   * - we do not share logs among different OST<->MDS connections, so that
   *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -309,18 +323,16 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
          /* NOTE: padding is a record, but no bit is set */
          if (left != 0 && left != reclen &&
              left < (reclen + LLOG_MIN_REC_SIZE)) {
-                loghandle->lgh_last_idx++;
-                rc = llog_lvfs_pad(obd, file, left, loghandle->lgh_last_idx);
+                index = loghandle->lgh_last_idx + 1;
+                rc = llog_lvfs_pad(obd, file, left, index);
                  if (rc)
                          RETURN(rc);
-                /* if it's the last idx in log file, then return -ENOSPC */
-                if (loghandle->lgh_last_idx == LLOG_BITMAP_SIZE(llh) - 1)
-                        RETURN(-ENOSPC);
+                loghandle->lgh_last_idx++; /*for pad rec*/
          }
-
-        loghandle->lgh_last_idx++;
-        index = loghandle->lgh_last_idx;
-        LASSERT(index < LLOG_BITMAP_SIZE(llh));
+        /* if it's the last idx in log file, then return -ENOSPC */
+        if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+                RETURN(-ENOSPC);
+        index = ++loghandle->lgh_last_idx;
          rec->lrh_index = index;
          if (buf == NULL) {
                  lrt = (struct llog_rec_tail *)
@@ -328,6 +340,9 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                  lrt->lrt_len = rec->lrh_len;
                  lrt->lrt_index = rec->lrh_index;
          }
+        /*The caller should make sure only 1 process access the lgh_last_idx,
+         *Otherwise it might hit the assert.*/
+        LASSERT(index < LLOG_BITMAP_SIZE(llh));
          if (ext2_set_bit(index, llh->llh_bitmap)) {
                  CERROR("argh, index %u already set in log bitmap?\n", index);
                  LBUG(); /* should never happen */
@@ -711,6 +726,7 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
          if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
                  struct inode *inode = fdentry->d_parent->d_inode;
                  struct lvfs_run_ctxt saved;
+                struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt);
  
                  push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                  dget(fdentry);
@@ -718,9 +734,10 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
  
                  if (rc == 0) {
                          LOCK_INODE_MUTEX(inode);
-                        rc = vfs_unlink(inode, fdentry);
+                        rc = ll_vfs_unlink(inode, fdentry, mnt);
                          UNLOCK_INODE_MUTEX(inode);
                  }
+                mntput(mnt);
  
                  dput(fdentry);
                  pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
@@ -748,18 +765,20 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
  
  /* reads the catalog list */
  int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+                      char *name, int idx, int count, struct llog_catid *idarray)
  {
          struct lvfs_run_ctxt saved;
          struct l_file *file;
-        int rc;
+        int rc, rc1 = 0;
          int size = sizeof(*idarray) * count;
-        loff_t off = 0;
+        loff_t off = idx *  sizeof(*idarray);
          ENTRY;
  
          if (!count) 
                  RETURN(0);
  
+        LASSERT_SEM_LOCKED(&obd->obd_llog_cat_process);
+
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
          if (!file || IS_ERR(file)) {
@@ -768,7 +787,7 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
                         name, rc);
                  GOTO(out, rc);
          }
-        
+
          if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
                  CERROR("%s is not a regular file!: mode = %o\n", name,
                         file->f_dentry->d_inode->i_mode);
@@ -778,6 +797,11 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
          CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
                 (int)i_size_read(file->f_dentry->d_inode), size);
  
+        memset(idarray, 0, size);
+        /* read for new ost index or for empty file */
+        if (i_size_read(file->f_dentry->d_inode) < off)
+                GOTO(out, rc = 0);
+
          rc = fsfilt_read_record(disk_obd, file, idarray, size, &off);
          if (rc) {
                  CERROR("OBD filter: error reading %s: rc %d\n", name, rc);
@@ -788,24 +812,27 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
   out:
          pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          if (file && !IS_ERR(file))
-                rc = filp_close(file, 0);
+                rc1 = filp_close(file, 0);
+        if (rc == 0)
+                rc = rc1;
          return rc;
  }
  EXPORT_SYMBOL(llog_get_cat_list);
  
  /* writes the cat list */
  int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+                      char *name, int idx, int count, struct llog_catid *idarray)
  {
          struct lvfs_run_ctxt saved;
          struct l_file *file;
-        int rc;
+        int rc, rc1 = 0;
          int size = sizeof(*idarray) * count;
-        loff_t off = 0;
+        loff_t off = idx * sizeof(*idarray);
  
-        if (!count) 
-                return (0);
+        if (!count)
+                GOTO(out1, rc = 0);
  
+        LASSERT_SEM_LOCKED(&obd->obd_llog_cat_process);
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
          if (!file || IS_ERR(file)) {
@@ -823,17 +850,22 @@ int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
  
          rc = fsfilt_write_record(disk_obd, file, idarray, size, &off, 1);
          if (rc) {
-                CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n",
+                CDEBUG(D_INODE,"OBD filter: error writeing %s: rc %d\n",
                         name, rc);
                  GOTO(out, rc);
          }
  
- out:
+out:
          pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          if (file && !IS_ERR(file))
-                rc = filp_close(file, 0);
+                rc1 = filp_close(file, 0);
+
+        if (rc == 0)
+                rc = rc1;
+out1:
          RETURN(rc);
  }
+EXPORT_SYMBOL(llog_put_cat_list);
  
  struct llog_operations llog_lvfs_ops = {
          lop_write_rec:   llog_lvfs_write_rec,
@@ -900,14 +932,14 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
  }
  
  int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+                      char *name, int idx, int count, struct llog_catid *idarray)
  {
          LBUG();
          return 0;
  }
  
  int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
-                      char *name, int count, struct llog_catid *idarray)
+                      char *name, int idx, int count, struct llog_catid *idarray)
  {
          LBUG();
          return 0;
diff --git a/lustre/obdclass/llog_obd.c b/lustre/obdclass/llog_obd.c

index a49c621..888b7f3 100644 (file)
--- a/lustre/obdclass/llog_obd.c
+++ b/lustre/obdclass/llog_obd.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2005 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -45,10 +57,10 @@ static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd)
          OBD_ALLOC(ctxt, sizeof(*ctxt));
          if (!ctxt)
                  return NULL;
-        
+
          ctxt->loc_obd = obd;
          atomic_set(&ctxt->loc_refcount, 1);
-        
+
          return ctxt;
  }
  
@@ -60,6 +72,7 @@ static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
                  class_import_put(ctxt->loc_imp);
                  ctxt->loc_imp = NULL;
          }
+        LASSERT(ctxt->loc_llcd == NULL);
          OBD_FREE(ctxt, sizeof(*ctxt));
          return;
  }
@@ -78,7 +91,11 @@ int __llog_ctxt_put(struct llog_ctxt *ctxt)
          obd->obd_llog_ctxt[ctxt->loc_idx] = NULL;
          spin_unlock(&obd->obd_dev_lock);
  
-       LASSERT(obd->obd_stopping == 1 || obd->obd_set_up == 0);
+        LASSERTF(obd->obd_starting == 1 || 
+                 obd->obd_stopping == 1 || obd->obd_set_up == 0,
+                 "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, 
+                 !!obd->obd_stopping, !!obd->obd_set_up);
+
          /* cleanup the llog ctxt here */
          if (CTXTP(ctxt, cleanup))
                  rc = CTXTP(ctxt, cleanup)(ctxt);
@@ -106,12 +123,22 @@ int llog_cleanup(struct llog_ctxt *ctxt)
  
          /* sync with other llog ctxt user thread */
          spin_lock(&obd->obd_dev_lock);
-       LASSERT(obd->obd_stopping == 1 || obd->obd_set_up == 0);
+
+        /* obd->obd_starting is needed for the case of cleanup
+         * in error case while obd is starting up. */
+        LASSERTF(obd->obd_starting == 1 || 
+                 obd->obd_stopping == 1 || obd->obd_set_up == 0,
+                 "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, 
+                 !!obd->obd_stopping, !!obd->obd_set_up);
+
          spin_unlock(&obd->obd_dev_lock);
  
          idx = ctxt->loc_idx;
          /*try to free the ctxt */
          rc = __llog_ctxt_put(ctxt);
+        if (rc)
+                CERROR("Error %d while cleaning up ctxt %p\n", 
+                       rc, ctxt);
  
          l_wait_event(obd->obd_llog_waitq, llog_ctxt_null(obd, idx), &lwi);
  
@@ -129,18 +156,22 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd,
          if (index < 0 || index >= LLOG_MAX_CTXTS)
                  RETURN(-EFAULT);
  
-        ctxt = llog_get_context(obd, index); 
+        /* someone can call lov_llog_init with NULL uuid - this can produce
+         * parallel enter to this function */
+        mutex_down(&obd->obd_llog_alloc);
+        ctxt = llog_get_context(obd, index);
          if (ctxt) {
                  /* mds_lov_update_mds might call here multiple times. So if the
                     llog is already set up then don't to do it again. */
-                CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", 
+                CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n",
                         obd->obd_name, index);
                  LASSERT(ctxt->loc_obd == obd);
                  LASSERT(ctxt->loc_exp == disk_obd->obd_self_export);
                  LASSERT(ctxt->loc_logops == op);
-                llog_ctxt_put(ctxt); 
+                llog_ctxt_put(ctxt);
                  GOTO(out, rc = 0);
          }
+
          ctxt = llog_new_ctxt(obd);
          if (!ctxt)
                  GOTO(out, rc = -ENOMEM);
@@ -151,14 +182,19 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd,
          ctxt->loc_logops = op;
          sema_init(&ctxt->loc_sem, 1);
  
-        if (op->lop_setup)
-                rc = op->lop_setup(obd, index, disk_obd, count, logid);
+        if (op->lop_setup) {
+                if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP))
+                        rc = -EOPNOTSUPP;
+                else
+                        rc = op->lop_setup(obd, index, disk_obd, count, logid);
+        }
  
          if (rc) {
                  llog_ctxt_destroy(ctxt);
                  obd->obd_llog_ctxt[index] = NULL;
          }
  out:
+        mutex_up(&obd->obd_llog_alloc);
          RETURN(rc);
  }
  EXPORT_SYMBOL(llog_setup);
@@ -179,11 +215,10 @@ int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
  EXPORT_SYMBOL(llog_sync);
  
  int llog_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
-                struct lov_stripe_md *lsm, struct llog_cookie *logcookies,
-                int numcookies)
+             struct lov_stripe_md *lsm, struct llog_cookie *logcookies,
+             int numcookies)
  {
-        __u32 cap;
-        int rc;
+        int raised, rc;
          ENTRY;
  
          if (!ctxt) {
@@ -192,10 +227,12 @@ int llog_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
          }
          
          CTXT_CHECK_OP(ctxt, add, -EOPNOTSUPP);
-        cap = current->cap_effective;             
-        cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+        raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+        if (!raised)
+                cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
          rc = CTXTP(ctxt, add)(ctxt, rec, lsm, logcookies, numcookies);
-        current->cap_effective = cap; 
+        if (!raised)
+                cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
          RETURN(rc);
  }
  EXPORT_SYMBOL(llog_add);
@@ -273,13 +310,17 @@ int llog_obd_origin_setup(struct obd_device *obd, int index,
  {
          struct llog_ctxt *ctxt;
          struct llog_handle *handle;
-        struct lvfs_run_ctxt saved;
+        struct lvfs_run_ctxt *saved = NULL;
          int rc;
          ENTRY;
  
          if (count == 0)
                  RETURN(0);
  
+        OBD_SLAB_ALLOC_PTR(saved, obd_lvfs_ctxt_cache);
+        if (saved == NULL)
+                RETURN(-ENOMEM);
+
          LASSERT(count == 1);
  
          ctxt = llog_get_context(obd, index);
@@ -297,9 +338,9 @@ int llog_obd_origin_setup(struct obd_device *obd, int index,
                  GOTO(out, rc);
  
          ctxt->loc_handle = handle;
-        push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+        push_ctxt(saved, &disk_obd->obd_lvfs_ctxt, NULL);
          rc = llog_init_handle(handle, LLOG_F_IS_CAT, NULL);
-        pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+        pop_ctxt(saved, &disk_obd->obd_lvfs_ctxt, NULL);
          if (rc)
                  GOTO(out, rc);
  
@@ -308,6 +349,7 @@ int llog_obd_origin_setup(struct obd_device *obd, int index,
                  CERROR("llog_process with cat_cancel_cb failed: %d\n", rc);
  out:
          llog_ctxt_put(ctxt);
+        OBD_SLAB_FREE_PTR(saved, obd_lvfs_ctxt_cache);
          RETURN(rc);
  }
  EXPORT_SYMBOL(llog_obd_origin_setup);
@@ -372,42 +414,40 @@ int llog_obd_origin_add(struct llog_ctxt *ctxt,
  }
  EXPORT_SYMBOL(llog_obd_origin_add);
  
-int llog_cat_initialize(struct obd_device *obd, int count,
+int llog_cat_initialize(struct obd_device *obd, int idx,
                          struct obd_uuid *uuid)
  {
+        struct llog_catid idarray;
          char name[32] = CATLIST;
-        struct llog_catid *idarray = NULL;
-        int size = sizeof(*idarray) * count;
          int rc;
          ENTRY;
  
-        if (count) {
-                OBD_VMALLOC(idarray, size);
-                if (!idarray)
-                        RETURN(-ENOMEM);
-        }
-
-        rc = llog_get_cat_list(obd, obd, name, count, idarray);
+        mutex_down(&obd->obd_llog_cat_process);
+        rc = llog_get_cat_list(obd, obd, name, idx, 1, &idarray);
          if (rc) {
                  CERROR("rc: %d\n", rc);
                  GOTO(out, rc);
          }
  
-        rc = obd_llog_init(obd, obd, count, idarray, uuid);
+        CDEBUG(D_INFO, "%s: Init llog for %s/%d - catid "LPX64"/"LPX64":%x\n",
+               obd->obd_name, uuid->uuid, idx, idarray.lci_logid.lgl_oid,
+               idarray.lci_logid.lgl_ogr, idarray.lci_logid.lgl_ogen);
+
+        rc = obd_llog_init(obd, obd, 1, &idarray, uuid);
          if (rc) {
                  CERROR("rc: %d\n", rc);
                  GOTO(out, rc);
          }
  
-        rc = llog_put_cat_list(obd, obd, name, count, idarray);
+        rc = llog_put_cat_list(obd, obd, name, idx, 1, &idarray);
          if (rc) {
                  CERROR("rc: %d\n", rc);
                  GOTO(out, rc);
          }
  
   out:
-        if (idarray)
-                OBD_VFREE(idarray, size);
+        mutex_up(&obd->obd_llog_cat_process);
+
          RETURN(rc);
  }
  EXPORT_SYMBOL(llog_cat_initialize);
diff --git a/lustre/obdclass/llog_swab.c b/lustre/obdclass/llog_swab.c

index 4f45df0..70a53d2 100644 (file)
--- a/lustre/obdclass/llog_swab.c
+++ b/lustre/obdclass/llog_swab.c
@@ -1,29 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2004-2005 Cluster File Systems, Inc.
- *   Author: jacob berkman  <jacob@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
   *
   * Swabbing of llog datatypes (from disk or over the wire).
   *
+ * Author: jacob berkman  <jacob@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -85,6 +99,14 @@ void lustre_swab_ll_fid(struct ll_fid *fid)
  }
  EXPORT_SYMBOL(lustre_swab_ll_fid);
  
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+        __swab64s(&fid->f_seq);
+        __swab32s(&fid->f_oid);
+        __swab32s(&fid->f_ver);
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
  void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail)
  {
          __swab32s(&rec->lrh_len);
@@ -109,8 +131,8 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail)
                  struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
  
                  __swab64s(&lur->lur_oid);
-                __swab32s(&lur->lur_ogen);
-
+                __swab32s(&lur->lur_ogr);
+                __swab32s(&lur->lur_count);
                  break;
          }
  
@@ -118,7 +140,18 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail)
                  struct llog_setattr_rec *lsr = (struct llog_setattr_rec *)rec;
  
                  __swab64s(&lsr->lsr_oid);
-                __swab32s(&lsr->lsr_ogen);
+                __swab32s(&lsr->lsr_ogr);
+                __swab32s(&lsr->lsr_uid);
+                __swab32s(&lsr->lsr_gid);
+
+                break;
+        }
+
+        case MDS_SETATTR64_REC: {
+                struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec;
+
+                __swab64s(&lsr->lsr_oid);
+                __swab32s(&lsr->lsr_ogr);
                  __swab32s(&lsr->lsr_uid);
                  __swab32s(&lsr->lsr_gid);
  
@@ -155,7 +188,7 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail)
                  __swab32s(&lid->lid_id.lgl_ogen);
                  break;
          }
-
+        case LLOG_JOIN_REC:
          case LLOG_PAD_MAGIC:
          /* ignore old pad records of type 0 */
          case 0:
@@ -251,3 +284,67 @@ void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
          return;
  }
  EXPORT_SYMBOL(lustre_swab_lustre_cfg);
+
+/* used only for compatibility with old on-disk cfg_marker data */
+struct cfg_marker32 {
+        __u32   cm_step;
+        __u32   cm_flags;
+        __u32   cm_vers;
+        __u32   padding;
+        __u32   cm_createtime;
+        __u32   cm_canceltime;
+        char    cm_tgtname[MTI_NAME_MAXLEN];
+        char    cm_comment[MTI_NAME_MAXLEN];
+};
+
+#define MTI_NAMELEN32    (MTI_NAME_MAXLEN - \
+        (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32)))
+
+void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size)
+{
+        struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker;
+        ENTRY;
+
+        if (swab) {
+                __swab32s(&marker->cm_step);
+                __swab32s(&marker->cm_flags);
+                __swab32s(&marker->cm_vers);
+        }
+        if (size == sizeof(*cm32)) {
+                __u32 createtime, canceltime;
+                /* There was a problem with the original declaration of
+                 * cfg_marker on 32-bit systems because it used time_t as
+                 * a wire protocol structure, and didn't verify this in
+                 * wirecheck.  We now have to convert the offsets of the
+                 * later fields in order to work on 32- and 64-bit systems.
+                 *
+                 * Fortunately, the cm_comment field has no functional use
+                 * so can be sacrificed when converting the timestamp size.
+                 *
+                 * Overwrite fields from the end first, so they are not
+                 * clobbered, and use memmove() instead of memcpy() because
+                 * the source and target buffers overlap.  bug 16771 */
+                createtime = cm32->cm_createtime;
+                canceltime = cm32->cm_canceltime;
+                memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+                marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+                memmove(marker->cm_tgtname, cm32->cm_tgtname,
+                        sizeof(marker->cm_tgtname));
+                if (swab) {
+                        __swab32s(&createtime);
+                        __swab32s(&canceltime);
+                }
+                marker->cm_createtime = createtime;
+                marker->cm_canceltime = canceltime;
+                CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) "
+                       "for target %s, converting\n",
+                       marker->cm_tgtname);
+        } else if (swab) {
+                __swab64s(&marker->cm_createtime);
+                __swab64s(&marker->cm_canceltime);
+        }
+
+        EXIT;
+        return;
+}
+EXPORT_SYMBOL(lustre_swab_cfg_marker);
diff --git a/lustre/obdclass/llog_test.c b/lustre/obdclass/llog_test.c

index 641137e..6721a7e 100644 (file)
--- a/lustre/obdclass/llog_test.c
+++ b/lustre/obdclass/llog_test.c
@@ -1,28 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2003 Cluster File Systems, Inc.
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * A kernel module which tests the llog API from the OBD setup function.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_test.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -224,6 +237,41 @@ static int llog_test_3(struct obd_device *obd, struct llog_handle *llh)
  
          if ((rc = verify_handle("3c", llh, num_recs)))
                  RETURN(rc);
+       
+        CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n");
+        for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) {
+                struct llog_rec_hdr hdr;
+                char buf_even[24];
+                char buf_odd[32];
+
+                memset(buf_odd, 0, sizeof buf_odd);
+                memset(buf_even, 0, sizeof buf_even);
+                if ((i % 2) == 0) {
+                        hdr.lrh_len = 24;
+                        hdr.lrh_type = OBD_CFG_REC;
+                        rc = llog_write_rec(llh, &hdr, NULL, 0, buf_even, -1);
+                } else {
+                        hdr.lrh_len = 32;
+                        hdr.lrh_type = OBD_CFG_REC;
+                        rc = llog_write_rec(llh, &hdr, NULL, 0, buf_odd, -1);
+                }
+                if (rc) {
+                        if (rc == -ENOSPC) {
+                                break;
+                        } else {
+                                       CERROR("3c: write recs failed at #%d: %d\n",
+                                               i + 1, rc);
+                                       RETURN(rc);
+                        }
+                }
+                num_recs++;
+        }
+       if (rc != -ENOSPC) {
+                CWARN("3d: write record more than BITMAP size!\n");
+                RETURN(-EINVAL);
+        }
+        if ((rc = verify_handle("3d", llh, num_recs)))
+                RETURN(rc);
  
          RETURN(rc);
  }
@@ -712,7 +760,7 @@ static void __exit llog_test_exit(void)
          class_unregister_type("llog_test");
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("llog test module");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c

index c80e811..901481a 100644 (file)
--- a/lustre/obdclass/lprocfs_status.c
+++ b/lustre/obdclass/lprocfs_status.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -96,32 +111,39 @@ static int lprocfs_obd_snprintf(char **page, int end, int *len,
          return n;
  }
  
-int lprocfs_add_simple(struct proc_dir_entry *root, char *name,
-                       read_proc_t *read_proc, write_proc_t *write_proc,
-                       void *data)
+cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                        char *name,
+                                        read_proc_t *read_proc,
+                                        write_proc_t *write_proc,
+                                        void *data,
+                                        struct file_operations *fops)
  {
-        struct proc_dir_entry *proc;
+        cfs_proc_dir_entry_t *proc;
          mode_t mode = 0;
-        
+
          if (root == NULL || name == NULL)
-                return -EINVAL;
+                return ERR_PTR(-EINVAL);
          if (read_proc)
                  mode = 0444;
          if (write_proc)
                  mode |= 0200;
+        if (fops)
+                mode = 0644;
          proc = create_proc_entry(name, mode, root);
          if (!proc) {
                  CERROR("LprocFS: No memory to create /proc entry %s", name);
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
          }
          proc->read_proc = read_proc;
          proc->write_proc = write_proc;
          proc->data = data;
-        return 0;
+        if (fops)
+                proc->proc_fops = fops;
+        return proc;
  }
  
-
-static ssize_t lprocfs_fops_read(struct file *f, char __user *buf, size_t size, loff_t *ppos)
+static ssize_t lprocfs_fops_read(struct file *f, char __user *buf, size_t size,
+                                 loff_t *ppos)
  {
          struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
          char *page, *start = NULL;
@@ -137,7 +159,7 @@ static ssize_t lprocfs_fops_read(struct file *f, char __user *buf, size_t size,
          LPROCFS_ENTRY();
          OBD_FAIL_TIMEOUT(OBD_FAIL_LPROC_REMOVE, 10);
          if (!dp->deleted && dp->read_proc)
-                rc = dp->read_proc(page, &start, *ppos, PAGE_SIZE, 
+                rc = dp->read_proc(page, &start, *ppos, PAGE_SIZE,
                          &eof, dp->data);
          LPROCFS_EXIT();
          if (rc <= 0)
@@ -169,7 +191,8 @@ out:
          return rc;
  }
  
-static ssize_t lprocfs_fops_write(struct file *f, const char __user *buf, size_t size, loff_t *ppos)
+static ssize_t lprocfs_fops_write(struct file *f, const char __user *buf,
+                                  size_t size, loff_t *ppos)
  {
          struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
          int rc = -EIO;
@@ -217,6 +240,17 @@ struct file_operations lprocfs_evict_client_fops = {
  };
  EXPORT_SYMBOL(lprocfs_evict_client_fops);
  
+/**
+ * Add /proc entrys.
+ *
+ * \param root [in]  The parent proc entry on which new entry will be added.
+ * \param list [in]  Array of proc entries to be added.
+ * \param data [in]  The argument to be passed when entries read/write routines
+ *                   are called through /proc file.
+ *
+ * \retval 0   on success
+ *         < 0 on error
+ */
  int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
                       void *data)
  {
@@ -256,10 +290,14 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
                                              proc_mkdir(cur, cur_root));
                          } else if (proc == NULL) {
                                  mode_t mode = 0;
-                                if (list->read_fptr)
-                                        mode = 0444;
-                                if (list->write_fptr)
-                                        mode |= 0200;
+                                if (list->proc_mode != 0000) {
+                                        mode = list->proc_mode;
+                                } else {
+                                        if (list->read_fptr)
+                                                mode = 0444;
+                                        if (list->write_fptr)
+                                                mode |= 0200;
+                                }
                                  proc = create_proc_entry(cur, mode, cur_root);
                          }
                  }
@@ -298,7 +336,7 @@ void lprocfs_remove(struct proc_dir_entry **rooth)
  
          parent = root->parent;
          LASSERT(parent != NULL);
-        LPROCFS_ENTRY(); /* search vs remove race */
+        LPROCFS_WRITE_ENTRY(); /* search vs remove race */
  
          while (1) {
                  while (temp->subdir != NULL)
@@ -313,14 +351,14 @@ void lprocfs_remove(struct proc_dir_entry **rooth)
                           "0x%p  %s/%s len %d\n", rm_entry, temp->name,
                           rm_entry->name, (int)strlen(rm_entry->name));
  
-                /* Now, the rm_entry->deleted flags is protected 
+                /* Now, the rm_entry->deleted flags is protected
                   * by _lprocfs_lock. */
                  rm_entry->data = NULL;
                  remove_proc_entry(rm_entry->name, temp);
                  if (temp == parent)
                          break;
          }
-        LPROCFS_EXIT();
+        LPROCFS_WRITE_EXIT();
  }
  
  struct proc_dir_entry *lprocfs_register(const char *name,
@@ -399,14 +437,14 @@ int lprocfs_wr_atomic(struct file *file, const char *buffer,
          atomic_t *atm = data;
          int val = 0;
          int rc;
-        
+
          rc = lprocfs_write_helper(buffer, count, &val);
          if (rc < 0)
                  return rc;
  
          if (val <= 0)
                  return -ERANGE;
-                
+
          atomic_set(atm, val);
          return count;
  }
@@ -568,7 +606,7 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
          struct ptlrpc_connection *conn;
          int rc = 0;
  
-        LASSERT(obd != NULL); 
+        LASSERT(obd != NULL);
          LPROCFS_CLIMP_CHECK(obd);
          conn = obd->u.cli.cl_import->imp_connection;
          LASSERT(conn != NULL);
@@ -579,12 +617,81 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
          return rc;
  }
  
-int lprocfs_at_hist_helper(char *page, int count, int rc, 
+#define flag2str(flag) \
+        if (imp->imp_##flag && max - len > 0) \
+                len += snprintf(str + len, max - len, " " #flag);
+
+/**
+ * Append a space separated list of current set flags to str.
+ */
+static int obd_import_flags2str(struct obd_import *imp, char *str,
+                                          int max)
+{
+        int len = 0;
+
+        if (imp->imp_obd->obd_no_recov)
+                len += snprintf(str, max - len, " no_recov");
+
+        flag2str(invalid);
+        flag2str(deactive);
+        flag2str(replayable);
+        flag2str(pingable);
+        flag2str(recon_bk);
+        flag2str(last_recon);
+        return len;
+}
+#undef flags2str
+
+int lprocfs_rd_import(char *page, char **start, off_t off, int count,
+                      int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        struct obd_import *imp;
+        char *imp_state_name = NULL;
+        int rc = 0;
+
+        LASSERT(obd != NULL);
+        LPROCFS_CLIMP_CHECK(obd);
+        imp = obd->u.cli.cl_import;
+        imp_state_name = ptlrpc_import_state_name(imp->imp_state);
+        *eof = 1;
+
+        rc = snprintf(page, count,
+                      "import: %s\n"
+                      "    target: %s@%s\n"
+                      "    state: %s\n"
+                      "    inflight: %u\n"
+                      "    unregistering: %u\n"
+                      "    conn_cnt: %u\n"
+                      "    generation: %u\n"
+                      "    inval_cnt: %u\n"
+                      "    last_replay_transno: "LPU64"\n"
+                      "    peer_committed_transno: "LPU64"\n"
+                      "    last_trasno_checked: "LPU64"\n"
+                      "    flags:",
+                      obd->obd_name,
+                      obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid,
+                      imp_state_name,
+                      atomic_read(&imp->imp_inflight),
+                      atomic_read(&imp->imp_unregistering),
+                      imp->imp_conn_cnt,
+                      imp->imp_generation,
+                      atomic_read(&imp->imp_inval_count),
+                      imp->imp_last_replay_transno,
+                      imp->imp_peer_committed_transno,
+                      imp->imp_last_transno_checked);
+        rc += obd_import_flags2str(imp, page + rc, count - rc);
+        rc += snprintf(page+rc, count - rc, "\n");
+        LPROCFS_CLIMP_EXIT(obd);
+        return rc;
+}
+
+int lprocfs_at_hist_helper(char *page, int count, int rc,
                             struct adaptive_timeout *at)
  {
          int i;
          for (i = 0; i < AT_BINS; i++)
-                rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]); 
+                rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]);
          rc += snprintf(page + rc, count - rc, "\n");
          return rc;
  }
@@ -609,20 +716,19 @@ int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
  
          /* Some network health info for kicks */
          s2dhms(&ts, now - imp->imp_last_reply_time);
-        rc += snprintf(page + rc, count - rc, 
+        rc += snprintf(page + rc, count - rc,
                         "%-10s : %ld, "DHMS_FMT" ago\n",
                         "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
  
-
          cur = at_get(&imp->imp_at.iat_net_latency);
          worst = imp->imp_at.iat_net_latency.at_worst_ever;
          worstt = imp->imp_at.iat_net_latency.at_worst_time;
          s2dhms(&ts, now - worstt);
-        rc += snprintf(page + rc, count - rc, 
+        rc += snprintf(page + rc, count - rc,
                         "%-10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
-                       "network", cur, worst, worstt, DHMS_VARS(&ts)); 
+                       "network", cur, worst, worstt, DHMS_VARS(&ts));
          rc = lprocfs_at_hist_helper(page, count, rc,
-                                    &imp->imp_at.iat_net_latency); 
+                                    &imp->imp_at.iat_net_latency);
  
          for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
                  if (imp->imp_at.iat_portal[i] == 0)
@@ -633,7 +739,7 @@ int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
                  s2dhms(&ts, now - worstt);
                  rc += snprintf(page + rc, count - rc,
                                 "portal %-2d  : cur %3u  worst %3u (at %ld, "
-                               DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], 
+                               DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
                                 cur, worst, worstt, DHMS_VARS(&ts));
                  rc = lprocfs_at_hist_helper(page, count, rc,
                                            &imp->imp_at.iat_service_estimate[i]);
@@ -643,6 +749,7 @@ int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
          return rc;
  }
  
+/* see OBD_CONNECT_* */
  static const char *obd_connect_names[] = {
          "read_only",
          "lov_index",
@@ -675,6 +782,8 @@ static const char *obd_connect_names[] = {
          "change_qunit_size",
          "alt_checksum_algorithm",
          "fid_is_enabled",
+        "version_recovery",
+        "pools",
          NULL
  };
  
@@ -743,7 +852,7 @@ int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
  
  int lprocfs_obd_cleanup(struct obd_device *obd)
  {
-        if (!obd) 
+        if (!obd)
                  return -EINVAL;
          if (obd->obd_proc_exports_entry) {
                  /* Should be no exports left */
@@ -772,9 +881,12 @@ static void lprocfs_free_client_stats(struct nid_stat *client_stat)
                  lprocfs_free_stats(&client_stat->nid_stats);
  
          if (client_stat->nid_brw_stats)
-                OBD_FREE(client_stat->nid_brw_stats, sizeof(struct brw_stats));
+                OBD_FREE_PTR(client_stat->nid_brw_stats);
+
+        if (client_stat->nid_ldlm_stats)
+                lprocfs_free_stats(&client_stat->nid_ldlm_stats);
  
-        OBD_FREE(client_stat, sizeof(*client_stat));
+        OBD_FREE_PTR(client_stat);
          return;
  
  }
@@ -854,7 +966,7 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh)
          unsigned int num_cpu;
          unsigned int percpusize;
          unsigned int i;
-        
+
          if (!stats || (stats->ls_num == 0))
                  return;
          *statsh = NULL;
@@ -1094,6 +1206,8 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
@@ -1118,6 +1232,7 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_async_flags);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, teardown_async_page);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, update_lvb);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync);
@@ -1150,6 +1265,32 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
          LPROCFS_OBD_OP_INIT(num_private_stats,stats,unregister_page_removal_cb);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_lock_cancel_cb);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats,unregister_lock_cancel_cb);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+}
+
+void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{
+        lprocfs_counter_init(ldlm_stats,
+                             LDLM_ENQUEUE - LDLM_FIRST_OPC,
+                             0, "ldlm_enqueue", "reqs");
+        lprocfs_counter_init(ldlm_stats,
+                             LDLM_CONVERT - LDLM_FIRST_OPC,
+                             0, "ldlm_convert", "reqs");
+        lprocfs_counter_init(ldlm_stats,
+                             LDLM_CANCEL - LDLM_FIRST_OPC,
+                             0, "ldlm_cancel", "reqs");
+        lprocfs_counter_init(ldlm_stats,
+                             LDLM_BL_CALLBACK - LDLM_FIRST_OPC,
+                             0, "ldlm_bl_callback", "reqs");
+        lprocfs_counter_init(ldlm_stats,
+                             LDLM_CP_CALLBACK - LDLM_FIRST_OPC,
+                             0, "ldlm_cp_callback", "reqs");
+        lprocfs_counter_init(ldlm_stats,
+                             LDLM_GL_CALLBACK - LDLM_FIRST_OPC,
+                             0, "ldlm_gl_callback", "reqs");
  }
  
  int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
@@ -1192,7 +1333,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
  
  void lprocfs_free_obd_stats(struct obd_device *obd)
  {
-        if (obd->obd_stats) 
+        if (obd->obd_stats)
                  lprocfs_free_stats(&obd->obd_stats);
  }
  
@@ -1212,6 +1353,16 @@ struct exp_uuid_cb_data {
          int                    *len;
  };
  
+static void
+lprocfs_exp_rd_cb_data_init(struct exp_uuid_cb_data *cb_data, char *page,
+                            int count, int *eof, int *len)
+{
+        cb_data->page = page;
+        cb_data->count = count;
+        cb_data->eof = eof;
+        cb_data->len = len;
+}
+
  void lprocfs_exp_print_uuid(void *obj, void *cb_data)
  {
          struct obd_export *exp = (struct obd_export *)obj;
@@ -1233,15 +1384,43 @@ int lprocfs_exp_rd_uuid(char *page, char **start, off_t off, int count,
  
          *eof = 1;
          page[0] = '\0';
-        LASSERT(obd != NULL);
+        lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
+        lustre_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+                                 lprocfs_exp_print_uuid, &cb_data);
+        return (*cb_data.len);
+}
+
+void lprocfs_exp_print_hash(void *obj, void *cb_data)
+{
+        struct obd_export *exp = (struct obd_export *)obj;
+        struct exp_uuid_cb_data *data = (struct exp_uuid_cb_data *)cb_data;
+        lustre_hash_t *lh;
+
+        lh = exp->exp_lock_hash;
+        if (lh) {
+                if (!*data->len)
+                        *data->len += lustre_hash_debug_header(data->page,
+                                                               data->count);
+
+                *data->len += lustre_hash_debug_str(lh, data->page +
+                                                    *data->len,
+                                                    data->count);
+     }
+}
+
+int lprocfs_exp_rd_hash(char *page, char **start, off_t off, int count,
+                     int *eof,  void *data)
+{
+        struct nid_stat *stats = (struct nid_stat *)data;
+        struct exp_uuid_cb_data cb_data;
+        struct obd_device *obd = stats->nid_obd;
+        int len = 0;
  
-        cb_data.page = page;
-        cb_data.count = count;
-        cb_data.eof = eof;
-        cb_data.len = &len;
-        lustre_hash_bucket_iterate(obd->obd_nid_hash_body,
-                                   &stats->nid, lprocfs_exp_print_uuid,
-                                   &cb_data);
+        *eof = 1;
+        page[0] = '\0';
+        lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
+        lustre_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+                                 lprocfs_exp_print_hash, &cb_data);
          return (*cb_data.len);
  }
  
@@ -1259,17 +1438,16 @@ void lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
  {
          struct nid_stat *stat = obj;
          int i;
-
+        ENTRY;
          /* object has only hash + iterate_all references.
           * add/delete blocked by hash bucket lock */
          CDEBUG(D_INFO,"refcnt %d\n", stat->nid_exp_ref_count);
-        if(stat->nid_exp_ref_count == 2) {
+        if (stat->nid_exp_ref_count == 2) {
                  hlist_del_init(&stat->nid_hash);
                  stat->nid_exp_ref_count--;
                  spin_lock(&stat->nid_obd->obd_nid_lock);
-                list_del_init(&stat->nid_list);
+                list_move(&stat->nid_list, data);
                  spin_unlock(&stat->nid_obd->obd_nid_lock);
-                list_add(&stat->nid_list, data);
                  EXIT;
                  return;
          }
@@ -1285,7 +1463,6 @@ void lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
          return;
  }
  
-
  int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
                                           unsigned long count, void *data)
  {
@@ -1293,11 +1470,12 @@ int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
          struct nid_stat *client_stat;
          CFS_LIST_HEAD(free_list);
  
-        lustre_hash_iterate_all(obd->obd_nid_stats_hash_body,
-                                lprocfs_nid_stats_clear_write_cb, &free_list);
+        lustre_hash_for_each(obd->obd_nid_stats_hash,
+                             lprocfs_nid_stats_clear_write_cb, &free_list);
  
          while (!list_empty(&free_list)) {
-                client_stat = list_entry(free_list.next, struct nid_stat, nid_list);
+                client_stat = list_entry(free_list.next, struct nid_stat,
+                                         nid_list);
                  list_del_init(&client_stat->nid_list);
                  lprocfs_free_client_stats(client_stat);
          }
@@ -1309,88 +1487,162 @@ EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
  int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
  {
          int rc = 0;
-        struct nid_stat *tmp = NULL, *tmp1;
-        struct obd_device *obd = NULL;
+        struct nid_stat *new_stat, *old_stat;
+        struct nid_stat_uuid *cursor, *new_ns_uuid;
+        struct obd_device *obd;
+        cfs_proc_dir_entry_t *entry;
          ENTRY;
  
          *newnid = 0;
  
          if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
-            !exp->exp_obd->obd_nid_stats_hash_body)
+            !exp->exp_obd->obd_nid_stats_hash)
                  RETURN(-EINVAL);
  
-       /* not test against zero because eric say:
-        * You may only test nid against another nid, or LNET_NID_ANY.  Anything else is
-        * nonsense.*/
+        /* not test against zero because eric say:
+        * You may only test nid against another nid, or LNET_NID_ANY.
+         * Anything else is nonsense.*/
          if (!nid || *nid == LNET_NID_ANY)
                  RETURN(0);
  
          obd = exp->exp_obd;
  
-        CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash_body);
+        CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
  
-        OBD_ALLOC(tmp, sizeof(struct nid_stat));
-        if (tmp == NULL)
+        OBD_ALLOC_PTR(new_stat);
+        if (new_stat == NULL)
                  RETURN(-ENOMEM);
  
-        tmp->nid = *nid;
-        tmp->nid_obd = exp->exp_obd;
-        tmp->nid_exp_ref_count = 1; /* need live in hash after destroy export */
+        OBD_ALLOC_PTR(new_ns_uuid);
+        if (new_ns_uuid == NULL) {
+                OBD_FREE_PTR(new_stat);
+                RETURN(-ENOMEM);
+        }
+        CFS_INIT_LIST_HEAD(&new_ns_uuid->ns_uuid_list);
+        strncpy(new_ns_uuid->ns_uuid.uuid, exp->exp_client_uuid.uuid,
+                sizeof(struct obd_uuid));
+
+        CFS_INIT_LIST_HEAD(&new_stat->nid_uuid_list);
+        new_stat->nid = *nid;
+        new_stat->nid_obd = exp->exp_obd;
+        /* need live in hash after destroy export */
+        new_stat->nid_exp_ref_count = 1;
+
+        old_stat = lustre_hash_findadd_unique(obd->obd_nid_stats_hash,
+                                          nid, &new_stat->nid_hash);
+        CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
+               old_stat, libcfs_nid2str(*nid), new_stat->nid_exp_ref_count);
+
+        /* Return -EALREADY here so that we know that the /proc
+         * entry already has been created */
+        if (old_stat != new_stat) {
+                int found = 0;
+
+                exp->exp_nid_stats = old_stat;
+
+                /* We need to decrement the refcount if the uuid was
+                 * already in our list */
+                spin_lock(&obd->obd_nid_lock);
+                list_for_each_entry(cursor,
+                                    &old_stat->nid_uuid_list,
+                                    ns_uuid_list) {
+                        if (cursor && obd_uuid_equals(&cursor->ns_uuid,
+                                                      &exp->exp_client_uuid)) {
+                                found = 1;
+                                --old_stat->nid_exp_ref_count;
+                                break;
+                        }
+                }
  
-       /* protect competitive add to list, not need locking on destroy */
-        spin_lock(&obd->obd_nid_lock);
-        list_add(&tmp->nid_list, &obd->obd_nid_stats);
-        spin_unlock(&obd->obd_nid_lock);
+                if (!found)
+                        list_add(&new_ns_uuid->ns_uuid_list,
+                                 &old_stat->nid_uuid_list);
+                else
+                        OBD_FREE_PTR(new_ns_uuid);
  
-        tmp1= lustre_hash_findadd_unique(obd->obd_nid_stats_hash_body, nid,
-                                         &tmp->nid_hash);
-        CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
-               tmp1, libcfs_nid2str(*nid), tmp->nid_exp_ref_count);
+                spin_unlock(&obd->obd_nid_lock);
  
-        if (tmp1 != tmp) {
-                exp->exp_nid_stats = tmp1;
-                GOTO(destroy_new, rc = 0);
+                GOTO(destroy_new, rc = -EALREADY);
          }
          /* not found - create */
-        tmp->nid_proc = proc_mkdir(libcfs_nid2str(*nid),
+        new_stat->nid_proc = proc_mkdir(libcfs_nid2str(*nid),
                                     obd->obd_proc_exports_entry);
-        if (!tmp->nid_proc) {
+        if (!new_stat->nid_proc) {
                  CERROR("Error making export directory for"
                         " nid %s\n", libcfs_nid2str(*nid));
-                lustre_hash_delitem(obd->obd_nid_stats_hash_body, nid,
-                                    &tmp->nid_hash);
-                GOTO(destroy_new, rc = -ENOMEM);
+                GOTO(destroy_new_ns, rc = -ENOMEM);
          }
  
-        rc = lprocfs_add_simple(tmp->nid_proc, "uuid",
-                                lprocfs_exp_rd_uuid, NULL, tmp);
-        if (rc)
+        /* Add in uuid to our nid_stats list */
+        spin_lock(&obd->obd_nid_lock);
+        list_add(&new_ns_uuid->ns_uuid_list, &new_stat->nid_uuid_list);
+        spin_unlock(&obd->obd_nid_lock);
+
+        entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
+                                   lprocfs_exp_rd_uuid, NULL, new_stat, NULL);
+        if (IS_ERR(entry)) {
                  CWARN("Error adding the uuid file\n");
+                rc = PTR_ERR(entry);
+                GOTO(destroy_new_ns, rc);
+        }
+
+        entry = lprocfs_add_simple(new_stat->nid_proc, "hash",
+                                lprocfs_exp_rd_hash, NULL, new_stat, NULL);
+        if (IS_ERR(entry)) {
+                CWARN("Error adding the hash file\n");
+                rc = PTR_ERR(entry);
+                lprocfs_remove(&new_stat->nid_proc);
+                GOTO(destroy_new_ns, rc);
+        }
  
-        exp->exp_nid_stats = tmp;
+        exp->exp_nid_stats = new_stat;
          *newnid = 1;
+        /* protect competitive add to list, not need locking on destroy */
+        spin_lock(&obd->obd_nid_lock);
+        list_add(&new_stat->nid_list, &obd->obd_nid_stats);
+        spin_unlock(&obd->obd_nid_lock);
+
          RETURN(rc);
  
+destroy_new_ns:
+        lustre_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
+        OBD_FREE_PTR(new_ns_uuid);
+
  destroy_new:
-        spin_lock(&obd->obd_nid_lock);
-        list_del(&tmp->nid_list);
-        spin_unlock(&obd->obd_nid_lock);
-        OBD_FREE(tmp, sizeof(struct nid_stat));
+        OBD_FREE_PTR(new_stat);
          RETURN(rc);
  }
  
  int lprocfs_exp_cleanup(struct obd_export *exp)
  {
          struct nid_stat *stat = exp->exp_nid_stats;
+        struct nid_stat_uuid *cursor, *tmp;
+        int found = 0;
  
-        if(!stat)
+        if(!stat || !exp->exp_obd)
                  RETURN(0);
  
-        stat->nid_exp_ref_count--;
-        CDEBUG(D_INFO, "Put stat %p - %d\n", stat, stat->nid_exp_ref_count);
+        spin_lock(&exp->exp_obd->obd_nid_lock);
+        list_for_each_entry_safe(cursor, tmp,
+                                 &stat->nid_uuid_list,
+                                 ns_uuid_list) {
+                if (cursor && obd_uuid_equals(&cursor->ns_uuid,
+                                              &exp->exp_client_uuid)) {
+                        found = 1;
+                        list_del(&cursor->ns_uuid_list);
+                        OBD_FREE_PTR(cursor);
+                        --stat->nid_exp_ref_count;
+                        CDEBUG(D_INFO, "Put stat %p - %d\n", stat,
+                               stat->nid_exp_ref_count);
+                        break;
+                }
+        }
+        spin_unlock(&exp->exp_obd->obd_nid_lock);
+        if (!found)
+                CERROR("obd_export's client uuid %s are not found in its "
+                       "nid_stats list\n", exp->exp_client_uuid.uuid);
  
          exp->exp_nid_stats = NULL;
-        lprocfs_free_stats(&exp->exp_ldlm_stats);
          lprocfs_free_stats(&exp->exp_ops_stats);
  
          return 0;
@@ -1444,7 +1696,8 @@ int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
          return 0;
  }
  
-int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, int mult)
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
+                             int mult)
  {
          long decimal_val, frac_val;
          int prtn;
@@ -1462,7 +1715,8 @@ int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, int mu
  
                  temp_frac = frac_val * 10;
                  buffer[prtn++] = '.';
-                while (frac_bits < 2 && (temp_frac / mult) < 1 ) { /*only reserved 2bits fraction*/
+                while (frac_bits < 2 && (temp_frac / mult) < 1 ) {
+                        /*only reserved 2bits fraction*/
                          buffer[prtn++] ='0';
                          temp_frac *= 10;
                          frac_bits++;
@@ -1473,7 +1727,7 @@ int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, int mu
                          2. #echo x.0x > /proc/xxx       output result : x.0x
                          3. #echo x.x0 > /proc/xxx       output result : x.x
                          4. #echo x.xx > /proc/xxx       output result : x.xx
-                        Only reserved 2bits fraction.       
+                        Only reserved 2bits fraction.
                   */
                  for (i = 0; i < (5 - prtn); i++)
                          temp_mult *= 10;
@@ -1561,7 +1815,7 @@ int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
          return 0;
  }
  
-int lprocfs_seq_create(cfs_proc_dir_entry_t *parent, 
+int lprocfs_seq_create(cfs_proc_dir_entry_t *parent,
                         char *name, mode_t mode,
                         struct file_operations *seq_fops, void *data)
  {
@@ -1583,7 +1837,7 @@ __inline__ int lprocfs_obd_seq_create(struct obd_device *dev, char *name,
                                        struct file_operations *seq_fops,
                                        void *data)
  {
-        return (lprocfs_seq_create(dev->obd_proc_entry, name, 
+        return (lprocfs_seq_create(dev->obd_proc_entry, name,
                                     mode, seq_fops, data));
  }
  EXPORT_SYMBOL(lprocfs_obd_seq_create);
@@ -1675,11 +1929,17 @@ int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
                                           obd->obd_recovery_end -
                                           obd->obd_recovery_start) <= 0)
                          goto out;
+                if (lprocfs_obd_snprintf(&page, size, &len,
+                                         "delayed_clients: %d/%d\n",
+                                         obd->obd_delayed_clients,
+                                         obd->obd_max_recoverable_clients) <= 0)
+                        goto out;
                  /* Number of clients that have completed recovery */
                  if (lprocfs_obd_snprintf(&page, size, &len,
                                           "completed_clients: %d/%d\n",
                                           obd->obd_max_recoverable_clients -
-                                         obd->obd_recoverable_clients,
+                                         obd->obd_recoverable_clients -
+                                         obd->obd_delayed_clients,
                                           obd->obd_max_recoverable_clients) <= 0)
                          goto out;
                  if (lprocfs_obd_snprintf(&page, size, &len,
@@ -1706,10 +1966,15 @@ int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
                                   obd->obd_connected_clients,
                                   obd->obd_max_recoverable_clients) <= 0)
                  goto out;
+        if (lprocfs_obd_snprintf(&page, size, &len,"delayed_clients: %d/%d\n",
+                                 obd->obd_delayed_clients,
+                                 obd->obd_max_recoverable_clients) <= 0)
+                goto out;
          /* Number of clients that have completed recovery */
          if (lprocfs_obd_snprintf(&page, size, &len,"completed_clients: %d/%d\n",
                                   obd->obd_max_recoverable_clients -
-                                 obd->obd_recoverable_clients,
+                                 obd->obd_recoverable_clients -
+                                 obd->obd_delayed_clients,
                                   obd->obd_max_recoverable_clients) <= 0)
                  goto out;
          if (lprocfs_obd_snprintf(&page, size, &len,"replayed_requests: %d/??\n",
@@ -1729,6 +1994,24 @@ out:
  }
  EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status);
  
+int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
+                        int count, int *eof, void *data)
+{
+        struct obd_device *obd = data;
+        int c = 0;
+
+        if (obd == NULL)
+                return 0;
+
+        c += lustre_hash_debug_header(page, count);
+        c += lustre_hash_debug_str(obd->obd_uuid_hash, page + c, count - c);
+        c += lustre_hash_debug_str(obd->obd_nid_hash, page + c, count - c);
+        c += lustre_hash_debug_str(obd->obd_nid_stats_hash, page+c, count-c);
+
+        return c;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_hash);
+
  #ifdef CRAY_XT3
  int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
                                      int count, int *eof, void *data)
@@ -1736,7 +2019,7 @@ int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
          struct obd_device *obd = (struct obd_device *)data;
          LASSERT(obd != NULL);
  
-        return snprintf(page, count, "%lu\n", 
+        return snprintf(page, count, "%lu\n",
                          obd->obd_recovery_max_time);
  }
  EXPORT_SYMBOL(lprocfs_obd_rd_recovery_maxtime);
@@ -1758,82 +2041,20 @@ int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
  EXPORT_SYMBOL(lprocfs_obd_wr_recovery_maxtime);
  #endif /* CRAY_XT3 */
  
-#ifdef HAVE_QUOTA_SUPPORT
-int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count,
-                           int *eof, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_bunit_sz);
-}
-EXPORT_SYMBOL(lprocfs_quota_rd_bunit);
-
-int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
-                           unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        int val, rc;
-        LASSERT(obd != NULL);
-
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val % QUOTABLOCK_SIZE ||
-            val <= obd->u.obt.obt_qctxt.lqc_btune_sz)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_bunit_sz = val;
-        return count;
-}
-EXPORT_SYMBOL(lprocfs_quota_wr_bunit);
-
-int lprocfs_quota_rd_btune(char *page, char **start, off_t off, int count,
-                           int *eof, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_btune_sz);
-}
-EXPORT_SYMBOL(lprocfs_quota_rd_btune);
-
-int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
-                           unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        int val, rc;
-        LASSERT(obd != NULL);
-
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val <= QUOTABLOCK_SIZE * MIN_QLIMIT || val % QUOTABLOCK_SIZE ||
-            val >= obd->u.obt.obt_qctxt.lqc_bunit_sz)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_btune_sz = val;
-        return count;
-}
-EXPORT_SYMBOL(lprocfs_quota_wr_btune);
-
-int lprocfs_quota_rd_iunit(char *page, char **start, off_t off, int count,
-                           int *eof, void *data)
+#ifdef HAVE_DELAYED_RECOVERY
+int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
  {
          struct obd_device *obd = (struct obd_device *)data;
          LASSERT(obd != NULL);
  
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_iunit_sz);
+        return snprintf(page, count, "%u\n",
+                        obd->u.obt.obt_stale_export_age);
  }
-EXPORT_SYMBOL(lprocfs_quota_rd_iunit);
+EXPORT_SYMBOL(lprocfs_obd_rd_stale_export_age);
  
-int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
-                           unsigned long count, void *data)
+int lprocfs_obd_wr_stale_export_age(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
  {
          struct obd_device *obd = (struct obd_device *)data;
          int val, rc;
@@ -1843,58 +2064,41 @@ int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
          if (rc)
                  return rc;
  
-        if (val <= obd->u.obt.obt_qctxt.lqc_itune_sz)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_iunit_sz = val;
+        target_trans_table_recalc(obd, val);
+        obd->u.obt.obt_stale_export_age = val;
          return count;
  }
-EXPORT_SYMBOL(lprocfs_quota_wr_iunit);
+EXPORT_SYMBOL(lprocfs_obd_wr_stale_export_age);
  
-int lprocfs_quota_rd_itune(char *page, char **start, off_t off, int count,
-                           int *eof, void *data)
+static int obd_stale_exports_seq_show(struct seq_file *seq, void *v)
  {
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
+        struct obd_device *obd = seq->private;
+        struct obd_export *exp;
  
-        return snprintf(page, count, "%lu\n",
-                        obd->u.obt.obt_qctxt.lqc_itune_sz);
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each_entry(exp, &obd->obd_delayed_exports,
+                            exp_obd_chain) {
+                seq_printf(seq, "%s: %ld seconds ago%s\n",
+                           obd_uuid2str(&exp->exp_client_uuid),
+                           cfs_time_current_sec() - exp->exp_last_request_time,
+                           exp_expired(exp, obd->u.obt.obt_stale_export_age) ?
+                                       " [EXPIRED]" : "");
+        }
+        spin_unlock(&obd->obd_dev_lock);
+        return 0;
  }
-EXPORT_SYMBOL(lprocfs_quota_rd_itune);
-
-int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
-                           unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        int val, rc;
-        LASSERT(obd != NULL);
  
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
+LPROC_SEQ_FOPS_RO(obd_stale_exports);
  
-        if (val <= MIN_QLIMIT ||
-            val >= obd->u.obt.obt_qctxt.lqc_iunit_sz)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_itune_sz = val;
-        return count;
-}
-EXPORT_SYMBOL(lprocfs_quota_wr_itune);
-
-int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data)
+int lprocfs_obd_attach_stale_exports(struct obd_device *dev)
  {
-        struct obd_device *obd = (struct obd_device *)data;
-        LASSERT(obd != NULL);
-
-        return snprintf(page, count, "%d\n",
-                        obd->u.obt.obt_qctxt.lqc_switch_seconds);
+        return lprocfs_obd_seq_create(dev, "stale_exports", 0444,
+                                      &obd_stale_exports_fops, dev);
  }
-EXPORT_SYMBOL(lprocfs_quota_rd_switch_seconds);
+EXPORT_SYMBOL(lprocfs_obd_attach_stale_exports);
  
-int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer,
-                                    unsigned long count, void *data)
+int lprocfs_obd_wr_flush_stale_exports(struct file *file, const char *buffer,
+                                       unsigned long count, void *data)
  {
          struct obd_device *obd = (struct obd_device *)data;
          int val, rc;
@@ -1904,17 +2108,12 @@ int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer,
          if (rc)
                  return rc;
  
-        if (val <= 10)
-                return -EINVAL;
-
-        obd->u.obt.obt_qctxt.lqc_switch_seconds = val;
+        class_disconnect_expired_exports(obd);
          return count;
  }
-EXPORT_SYMBOL(lprocfs_quota_wr_switch_seconds);
-
+EXPORT_SYMBOL(lprocfs_obd_wr_flush_stale_exports);
  #endif
  
-
  EXPORT_SYMBOL(lprocfs_register);
  EXPORT_SYMBOL(lprocfs_srch);
  EXPORT_SYMBOL(lprocfs_remove);
@@ -1928,6 +2127,7 @@ EXPORT_SYMBOL(lprocfs_free_stats);
  EXPORT_SYMBOL(lprocfs_clear_stats);
  EXPORT_SYMBOL(lprocfs_register_stats);
  EXPORT_SYMBOL(lprocfs_init_ops_stats);
+EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
  EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
  EXPORT_SYMBOL(lprocfs_free_obd_stats);
  EXPORT_SYMBOL(lprocfs_exp_setup);
@@ -1946,6 +2146,7 @@ EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
  EXPORT_SYMBOL(lprocfs_rd_num_exports);
  EXPORT_SYMBOL(lprocfs_rd_numrefs);
  EXPORT_SYMBOL(lprocfs_at_hist_helper);
+EXPORT_SYMBOL(lprocfs_rd_import);
  EXPORT_SYMBOL(lprocfs_rd_timeouts);
  EXPORT_SYMBOL(lprocfs_rd_blksize);
  EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
diff --git a/lustre/obdclass/lustre_handles.c b/lustre/obdclass/lustre_handles.c

index 05b10c4..e830330 100644 (file)
--- a/lustre/obdclass/lustre_handles.c
+++ b/lustre/obdclass/lustre_handles.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lustre_handles.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_CLASS
@@ -164,7 +179,7 @@ void *class_handle2object(__u64 cookie)
                          continue;
  
                  spin_lock(&h->h_lock);
-                if (likely(h->h_cookie != 0)) {
+                if (likely(h->h_in != 0)) {
                          h->h_addref(h);
                          retval = h;
                  }
diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c

index c08b600..341d068 100644 (file)
--- a/lustre/obdclass/lustre_peer.c
+++ b/lustre/obdclass/lustre_peer.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c

index c56aa8f..f60629f 100644 (file)
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -1,28 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2001-2006 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Config API
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_config.c
+ *
+ * Config API
   */
  
  #define DEBUG_SUBSYSTEM S_CLASS
@@ -34,14 +47,16 @@
  #include <obd_class.h>
  #include <obd.h>
  #endif
+#include <lustre_disk.h>
  #include <lustre_log.h>
  #include <lprocfs_status.h>
  #include <libcfs/list.h>
  #include <lustre_param.h>
  #include <class_hash.h>
  
-extern struct lustre_hash_operations uuid_hash_operations;
-extern struct lustre_hash_operations nid_hash_operations;
+static lustre_hash_ops_t uuid_hash_ops;
+static lustre_hash_ops_t nid_hash_ops;
+static lustre_hash_ops_t nid_stat_hash_ops;
  
  /*********** string parsing utils *********/
  
@@ -50,15 +65,15 @@ int class_find_param(char *buf, char *key, char **valp)
  {
          char *ptr;
  
-        if (!buf) 
+        if (!buf)
                  return 1;
  
-        if ((ptr = strstr(buf, key)) == NULL) 
+        if ((ptr = strstr(buf, key)) == NULL)
                  return 1;
  
-        if (valp) 
+        if (valp)
                  *valp = ptr + strlen(key);
-        
+
          return 0;
  }
  
@@ -66,19 +81,19 @@ int class_find_param(char *buf, char *key, char **valp)
     valp points to first char after key. */
  int class_match_param(char *buf, char *key, char **valp)
  {
-        if (!buf) 
+        if (!buf)
                  return 1;
  
-        if (memcmp(buf, key, strlen(key)) != 0) 
+        if (memcmp(buf, key, strlen(key)) != 0)
                  return 1;
  
-        if (valp) 
+        if (valp)
                  *valp = buf + strlen(key);
-        
+
          return 0;
  }
  
-/* 0 is good nid, 
+/* 0 is good nid,
     1 not found
     < 0 error
     endh is set to next separator */
@@ -86,16 +101,16 @@ int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
  {
          char tmp, *endp;
  
-        if (!buf) 
+        if (!buf)
                  return 1;
-        while (*buf == ',' || *buf == ':') 
+        while (*buf == ',' || *buf == ':')
                  buf++;
-        if (*buf == ' ' || *buf == '/' || *buf == '\0') 
+        if (*buf == ' ' || *buf == '/' || *buf == '\0')
                  return 1;
  
          /* nid separators or end of nids */
          endp = strpbrk(buf, ",: /");
-        if (endp == NULL) 
+        if (endp == NULL)
                  endp = buf + strlen(buf);
  
          tmp = *endp;
@@ -108,7 +123,7 @@ int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
          }
          *endp = tmp;
  
-        if (endh) 
+        if (endh)
                  *endh = endp;
          CDEBUG(D_INFO, "Nid %s\n", libcfs_nid2str(*nid));
          return 0;
@@ -182,17 +197,18 @@ int class_attach(struct lustre_cfg *lcfg)
          }
          LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
                   name, typename);
-        LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, 
+        LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
                   "obd %p obd_magic %08X != %08X\n",
                   obd, obd->obd_magic, OBD_DEVICE_MAGIC);
-        LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, "%p obd_name %s != %s\n",
-                 obd, obd->obd_name, name);
+        LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
+                 "%p obd_name %s != %s\n", obd, obd->obd_name, name);
  
          rwlock_init(&obd->obd_pool_lock);
          obd->obd_pool_limit = 0;
          obd->obd_pool_slv = 0;
  
          CFS_INIT_LIST_HEAD(&obd->obd_exports);
+        CFS_INIT_LIST_HEAD(&obd->obd_delayed_exports);
          CFS_INIT_LIST_HEAD(&obd->obd_exports_timed);
          CFS_INIT_LIST_HEAD(&obd->obd_nid_stats);
          spin_lock_init(&obd->obd_nid_lock);
@@ -210,12 +226,11 @@ int class_attach(struct lustre_cfg *lcfg)
          cfs_waitq_init(&obd->obd_next_transno_waitq);
          cfs_waitq_init(&obd->obd_evict_inprogress_waitq);
          cfs_waitq_init(&obd->obd_llog_waitq);
+        init_mutex(&obd->obd_llog_alloc);
+        init_mutex(&obd->obd_llog_cat_process);
          CFS_INIT_LIST_HEAD(&obd->obd_recovery_queue);
          CFS_INIT_LIST_HEAD(&obd->obd_delayed_reply_queue);
  
-        spin_lock_init(&obd->obd_uncommitted_replies_lock);
-        CFS_INIT_LIST_HEAD(&obd->obd_uncommitted_replies);
-
          len = strlen(uuid);
          if (len >= sizeof(obd->obd_uuid)) {
                  CERROR("uuid must be < %d bytes long\n",
@@ -254,9 +269,11 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
          ENTRY;
  
          LASSERT(obd != NULL);
-        LASSERTF(obd == class_num2obd(obd->obd_minor), "obd %p != obd_devs[%d] %p\n", 
+        LASSERTF(obd == class_num2obd(obd->obd_minor),
+                 "obd %p != obd_devs[%d] %p\n",
                   obd, obd->obd_minor, class_num2obd(obd->obd_minor));
-        LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", 
+        LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+                 "obd %p obd_magic %08x != %08x\n",
                   obd, obd->obd_magic, OBD_DEVICE_MAGIC);
  
          /* have we attached a type to this device? */
@@ -282,30 +299,33 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
          /* just leave this on forever.  I can't use obd_set_up here because
             other fns check that status, and we're not actually set up yet. */
          obd->obd_starting = 1;
+        obd->obd_uuid_hash = NULL;
+        obd->obd_nid_hash = NULL;
+        obd->obd_nid_stats_hash = NULL;
          spin_unlock(&obd->obd_dev_lock);
  
-        /* create an uuid-export hash body */
-        err = lustre_hash_init(&obd->obd_uuid_hash_body, "UUID_HASH",
-                               128, &uuid_hash_operations);
-        if (err)
-                GOTO(err_hash, err);
-
-        /* create a nid-export hash body */
-        err = lustre_hash_init(&obd->obd_nid_hash_body, "NID_HASH",
-                               128, &nid_hash_operations);
-        if (err)
-                GOTO(err_hash, err);
+        /* create an uuid-export lustre hash */
+        obd->obd_uuid_hash = lustre_hash_init("UUID_HASH", 7, 7,
+                                              &uuid_hash_ops, 0);
+        if (!obd->obd_uuid_hash)
+                GOTO(err_hash, err = -ENOMEM);
  
-        /* create a nid-stats hash body */
-        err = lustre_hash_init(&obd->obd_nid_stats_hash_body, "NID_STATS",
-                               128, &nid_stat_hash_operations);
-        if (err)
-                GOTO(err_hash, err);
+        /* create a nid-export lustre hash */
+        obd->obd_nid_hash = lustre_hash_init("NID_HASH", 7, 7,
+                                             &nid_hash_ops, 0);
+        if (!obd->obd_nid_hash)
+                GOTO(err_hash, err = -ENOMEM);
  
+        /* create a nid-stats lustre hash */
+        obd->obd_nid_stats_hash = lustre_hash_init("NID_STATS", 7, 7,
+                                                   &nid_stat_hash_ops, 0);
+        if (!obd->obd_nid_stats_hash)
+                GOTO(err_hash, err = -ENOMEM);
  
          exp = class_new_export(obd, &obd->obd_uuid);
          if (IS_ERR(exp))
-                RETURN(PTR_ERR(exp));
+                GOTO(err_hash, err = PTR_ERR(exp));
+
          obd->obd_self_export = exp;
          list_del_init(&exp->exp_obd_chain_timed);
          class_export_put(exp);
@@ -324,17 +344,25 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                 obd->obd_name, obd->obd_uuid.uuid);
  
          RETURN(0);
-
  err_exp:
          class_unlink_export(obd->obd_self_export);
          obd->obd_self_export = NULL;
  err_hash:
-        lustre_hash_exit(&obd->obd_uuid_hash_body);
-        lustre_hash_exit(&obd->obd_nid_hash_body);
-        lustre_hash_exit(&obd->obd_nid_stats_hash_body);
+        if (obd->obd_uuid_hash) {
+                lustre_hash_exit(obd->obd_uuid_hash);
+                obd->obd_uuid_hash = NULL;
+        }
+        if (obd->obd_nid_hash) {
+                lustre_hash_exit(obd->obd_nid_hash);
+                obd->obd_nid_hash = NULL;
+        }
+        if (obd->obd_nid_stats_hash) {
+                lustre_hash_exit(obd->obd_nid_stats_hash);
+                obd->obd_nid_stats_hash = NULL;
+        }
          obd->obd_starting = 0;
          CERROR("setup %s failed (%d)\n", obd->obd_name, err);
-        RETURN(err);
+        return err;
  }
  
  int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
@@ -368,19 +396,22 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
  
  static void dump_exports(struct obd_device *obd)
  {
-        struct obd_export *exp, *n;
+        struct obd_export *exp;
  
-        list_for_each_entry_safe(exp, n, &obd->obd_exports, exp_obd_chain) {
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
                  struct ptlrpc_reply_state *rs;
                  struct ptlrpc_reply_state *first_reply = NULL;
                  int                        nreplies = 0;
  
+                spin_lock(&exp->exp_lock);
                  list_for_each_entry (rs, &exp->exp_outstanding_replies,
                                       rs_exp_list) {
                          if (nreplies == 0)
                                  first_reply = rs;
                          nreplies++;
                  }
+                spin_unlock(&exp->exp_lock);
  
                  CDEBUG(D_IOCTL, "%s: %p %s %s %d %d %d: %p %s\n",
                         obd->obd_name, exp, exp->exp_client_uuid.uuid,
@@ -389,6 +420,7 @@ static void dump_exports(struct obd_device *obd)
                         exp->exp_failed, nreplies, first_reply,
                         nreplies > 3 ? "..." : "");
          }
+        spin_unlock(&obd->obd_dev_lock);
  }
  
  int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
@@ -421,7 +453,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
                                  obd->obd_force = 1;
                                  break;
                          case 'A':
-                                LCONSOLE_WARN("Failing over %s\n", 
+                                LCONSOLE_WARN("Failing over %s\n",
                                                obd->obd_name);
                                  obd->obd_fail = 1;
                                  obd->obd_no_transno = 1;
@@ -441,7 +473,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
          /* The three references that should be remaining are the
           * obd_self_export and the attach and setup references. */
          if (atomic_read(&obd->obd_refcount) > 3) {
-#if 0           /* We should never fail to cleanup with mountconf */ 
+#if 0           /* We should never fail to cleanup with mountconf */
                  if (!(obd->obd_fail || obd->obd_force)) {
                          CERROR("OBD %s is still busy with %d references\n"
                                 "You should stop active file system users,"
@@ -453,7 +485,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
                          RETURN(-EBUSY);
                  }
  #endif
-                /* refcounf - 3 might be the number of real exports 
+                /* refcounf - 3 might be the number of real exports
                     (excluding self export). But class_incref is called
                     by other things as well, so don't count on it. */
                  CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
@@ -465,13 +497,13 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
          LASSERT(obd->obd_self_export);
  
          /* destroy an uuid-export hash body */
-        lustre_hash_exit(&obd->obd_uuid_hash_body);
+        lustre_hash_exit(obd->obd_uuid_hash);
  
          /* destroy a nid-export hash body */
-        lustre_hash_exit(&obd->obd_nid_hash_body);
+        lustre_hash_exit(obd->obd_nid_hash);
  
          /* destroy a nid-stats hash body */
-        lustre_hash_exit(&obd->obd_nid_stats_hash_body);
+        lustre_hash_exit(obd->obd_nid_stats_hash);
  
          /* Precleanup stage 1, we must make sure all exports (other than the
             self-export) get destroyed. */
@@ -479,7 +511,6 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
          if (err)
                  CERROR("Precleanup %s returned %d\n",
                         obd->obd_name, err);
-
          class_decref(obd);
          obd->obd_set_up = 0;
  
@@ -520,9 +551,7 @@ void class_decref(struct obd_device *obd)
                                 obd->obd_name, err);
  
                  spin_lock(&obd->obd_self_export->exp_lock);
-                obd->obd_self_export->exp_flags |=
-                        (obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
-                        (obd->obd_force ? OBD_OPT_FORCE : 0);
+                obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
                  spin_unlock(&obd->obd_self_export->exp_lock);
  
                  /* note that we'll recurse into class_decref again */
@@ -563,7 +592,7 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
                  RETURN(-EINVAL);
          }
          if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
-            strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && 
+            strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
              strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
                  CERROR("can't add connection on non-client dev\n");
                  RETURN(-EINVAL);
@@ -670,7 +699,7 @@ out:
                  OBD_FREE(lprof->lp_osc, osclen);
          if (lprof->lp_profile)
                  OBD_FREE(lprof->lp_profile, proflen);
-        OBD_FREE(lprof, sizeof(*lprof));        
+        OBD_FREE(lprof, sizeof(*lprof));
          RETURN(err);
  }
  
@@ -791,7 +820,7 @@ int class_process_config(struct lustre_cfg *lcfg)
          }
          case LCFG_PARAM: {
                  /* llite has no obd */
-                if ((class_match_param(lustre_cfg_string(lcfg, 1), 
+                if ((class_match_param(lustre_cfg_string(lcfg, 1),
                                         PARAM_LLITE, 0) == 0) &&
                      client_process_config) {
                          err = (*client_process_config)(lcfg);
@@ -835,6 +864,28 @@ int class_process_config(struct lustre_cfg *lcfg)
                  err = class_del_conn(obd, lcfg);
                  GOTO(out, err = 0);
          }
+        case LCFG_POOL_NEW: {
+                err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+                GOTO(out, err = 0);
+                break;
+        }
+        case LCFG_POOL_ADD: {
+                err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+                                   lustre_cfg_string(lcfg, 3));
+                GOTO(out, err = 0);
+                break;
+        }
+        case LCFG_POOL_REM: {
+                err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+                                   lustre_cfg_string(lcfg, 3));
+                GOTO(out, err = 0);
+                break;
+        }
+        case LCFG_POOL_DEL: {
+                err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+                GOTO(out, err = 0);
+                break;
+        }
          default: {
                  err = obd_process_config(obd, sizeof(*lcfg), lcfg);
                  GOTO(out, err);
@@ -843,14 +894,14 @@ int class_process_config(struct lustre_cfg *lcfg)
          }
  out:
          if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
-                CWARN("Ignoring error %d on optional command %#x\n", err, 
+                CWARN("Ignoring error %d on optional command %#x\n", err,
                        lcfg->lcfg_command);
                  err = 0;
          }
          return err;
  }
  
-int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, 
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
                               struct lustre_cfg *lcfg, void *data)
  {
  #ifdef __KERNEL__
@@ -862,7 +913,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
          ENTRY;
  
          if (lcfg->lcfg_command != LCFG_PARAM) {
-                CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+                CERROR("Unknown command: %x\n", lcfg->lcfg_command);
                  RETURN(-EINVAL);
          }
  
@@ -898,26 +949,26 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
                                                                 vallen, data);
                                          set_fs(oldfs);
                                  }
-                                if (rc < 0) 
-                                        CERROR("writing proc entry %s err %d\n", 
+                                if (rc < 0)
+                                        CERROR("writing proc entry %s err %d\n",
                                                 var->name, rc);
                                  break;
                          }
                          j++;
-                }    
+                }
                  if (!matched) {
                          CERROR("%s: unknown param %s\n",
                                 (char *)lustre_cfg_string(lcfg, 0), key);
                          /* rc = -EINVAL;       continue parsing other params */
                  } else {
-                        LCONSOLE_INFO("%s.%.*s: set parameter %.*s=%s\n", 
-                                      (char *)lustre_cfg_string(lcfg, 0),
+                        LCONSOLE_INFO("%s.%.*s: set parameter %.*s=%s\n",
+                                      lustre_cfg_string(lcfg, 0),
                                        (int)strlen(prefix) - 1, prefix,
                                        (int)(sval - key - 1), key, sval);
                  }
          }
-        
-        if (rc > 0) 
+
+        if (rc > 0)
                  rc = 0;
          RETURN(rc);
  #else
@@ -944,7 +995,7 @@ static int class_config_llog_handler(struct llog_handle * handle,
          char *cfg_buf = (char*) (rec + 1);
          int rc = 0;
          ENTRY;
-        
+
          //class_config_dump_handler(handle, rec, data);
  
          switch (rec->lrh_type) {
@@ -953,11 +1004,13 @@ static int class_config_llog_handler(struct llog_handle * handle,
                  struct lustre_cfg_bufs bufs;
                  char *inst_name = NULL;
                  int inst_len = 0;
-                int inst = 0;
+                int inst = 0, swab = 0;
  
                  lcfg = (struct lustre_cfg *)cfg_buf;
-                if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION))
+                if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
                          lustre_swab_lustre_cfg(lcfg);
+                        swab = 1;
+                }
  
                  rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
                  if (rc)
@@ -966,17 +1019,19 @@ static int class_config_llog_handler(struct llog_handle * handle,
                  /* Figure out config state info */
                  if (lcfg->lcfg_command == LCFG_MARKER) {
                          struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+                        lustre_swab_cfg_marker(marker, swab,
+                                               LUSTRE_CFG_BUFLEN(lcfg, 1));
                          CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n",
                                 clli->cfg_flags, marker->cm_flags);
                          if (marker->cm_flags & CM_START) {
                                  /* all previous flags off */
                                  clli->cfg_flags = CFG_F_MARKER;
-                                if (marker->cm_flags & CM_SKIP) { 
+                                if (marker->cm_flags & CM_SKIP) {
                                          clli->cfg_flags |= CFG_F_SKIP;
                                          CDEBUG(D_CONFIG, "SKIP #%d\n",
                                                 marker->cm_step);
                                  } else if ((marker->cm_flags & CM_EXCLUDE) ||
-                                           lustre_check_exclusion(clli->cfg_sb, 
+                                           lustre_check_exclusion(clli->cfg_sb,
                                                            marker->cm_tgtname)) {
                                          clli->cfg_flags |= CFG_F_EXCLUDE;
                                          CDEBUG(D_CONFIG, "EXCLUDE %d\n",
@@ -986,12 +1041,12 @@ static int class_config_llog_handler(struct llog_handle * handle,
                                  clli->cfg_flags = 0;
                          }
                  }
-                /* A config command without a start marker before it is 
+                /* A config command without a start marker before it is
                     illegal (post 146) */
                  if (!(clli->cfg_flags & CFG_F_COMPAT146) &&
-                    !(clli->cfg_flags & CFG_F_MARKER) && 
+                    !(clli->cfg_flags & CFG_F_MARKER) &&
                      (lcfg->lcfg_command != LCFG_MARKER)) {
-                        CWARN("Config not inside markers, ignoring! (%#x)\n", 
+                        CWARN("Config not inside markers, ignoring! (%#x)\n",
                                clli->cfg_flags);
                          clli->cfg_flags |= CFG_F_SKIP;
                  }
@@ -1004,14 +1059,34 @@ static int class_config_llog_handler(struct llog_handle * handle,
                          break;
                  }
  
-                if ((clli->cfg_flags & CFG_F_EXCLUDE) && 
+                /**
+                 * For interop mode between 1.8 and 2.0:
+                 * skip "lmv" configuration which exists since 2.0.
+                 */
+                {
+                        char *devname = lustre_cfg_string(lcfg, 0);
+                        char *typename = lustre_cfg_string(lcfg, 1);
+
+                        if (devname)
+                                devname += strlen(devname) - strlen("clilmv");
+
+                        if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
+                             strcmp(typename, "lmv") == 0) ||
+                            (devname && strcmp(devname, "clilmv") == 0)) {
+                                CWARN("skipping 'lmv' config: cmd=%x,%s:%s\n",
+                                       lcfg->lcfg_command, devname, typename);
+                                GOTO(out, rc = 0);
+                        }
+                }
+
+                if ((clli->cfg_flags & CFG_F_EXCLUDE) &&
                      (lcfg->lcfg_command == LCFG_LOV_ADD_OBD))
                          /* Add inactive instead */
                          lcfg->lcfg_command = LCFG_LOV_ADD_INA;
  
                  lustre_cfg_bufs_init(&bufs, lcfg);
  
-                if (clli && clli->cfg_instance && 
+                if (clli && clli->cfg_instance &&
                      LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){
                          inst = 1;
                          inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
@@ -1023,13 +1098,13 @@ static int class_config_llog_handler(struct llog_handle * handle,
                                  lustre_cfg_string(lcfg, 0),
                                  clli->cfg_instance);
                          lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
-                        CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", 
+                        CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n",
                                 lcfg->lcfg_command, inst_name);
                  }
  
                  /* we override the llog's uuid for clients, to insure they
                  are unique */
-                if (clli && clli->cfg_instance && 
+                if (clli && clli->cfg_instance &&
                      lcfg->lcfg_command == LCFG_ATTACH) {
                          lustre_cfg_bufs_set_string(&bufs, 2,
                                                     clli->cfg_uuid.uuid);
@@ -1096,15 +1171,15 @@ int class_config_parse_llog(struct llog_ctxt *ctxt, char *name,
  
          /* continue processing from where we last stopped to end-of-log */
          if (cfg)
-                cd.first_idx = cfg->cfg_last_idx;
-        cd.last_idx = 0;
+                cd.lpcd_first_idx = cfg->cfg_last_idx;
+        cd.lpcd_last_idx = 0;
  
          rc = llog_process(llh, class_config_llog_handler, cfg, &cd);
  
-        CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, 
-               cd.first_idx + 1, cd.last_idx, rc);
+        CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name,
+               cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc);
          if (cfg)
-                cfg->cfg_last_idx = cd.last_idx;
+                cfg->cfg_last_idx = cd.lpcd_last_idx;
  
  parse_out:
          rc2 = llog_close(llh);
@@ -1156,7 +1231,7 @@ int class_config_dump_handler(struct llog_handle * handle,
                  if (lcfg->lcfg_command == LCFG_MARKER) {
                          struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
                          ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'",
-                                        marker->cm_step, marker->cm_flags, 
+                                        marker->cm_step, marker->cm_flags,
                                          marker->cm_tgtname, marker->cm_comment);
                  } else {
                          for (i = 0; i <  lcfg->lcfg_bufcount; i++) {
@@ -1244,3 +1319,187 @@ out:
          RETURN(rc);
  }
  
+/*
+ * uuid<->export lustre hash operations
+ */
+
+static unsigned
+uuid_hash(lustre_hash_t *lh,  void *key, unsigned mask)
+{
+        return lh_djb2_hash(((struct obd_uuid *)key)->uuid,
+                            sizeof(((struct obd_uuid *)key)->uuid), mask);
+}
+
+static void *
+uuid_key(struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+        RETURN(&exp->exp_client_uuid);
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+uuid_compare(void *key, struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        LASSERT(key);
+        exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+        RETURN(obd_uuid_equals((struct obd_uuid *)key,&exp->exp_client_uuid) &&
+               !exp->exp_failed);
+}
+
+static void *
+uuid_export_get(struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+        class_export_get(exp);
+
+        RETURN(exp);
+}
+
+static void *
+uuid_export_put(struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+        class_export_put(exp);
+
+        RETURN(exp);
+}
+
+static lustre_hash_ops_t uuid_hash_ops = {
+        .lh_hash    = uuid_hash,
+        .lh_key     = uuid_key,
+        .lh_compare = uuid_compare,
+        .lh_get     = uuid_export_get,
+        .lh_put     = uuid_export_put,
+};
+
+
+/*
+ * nid<->export hash operations
+ */
+
+static unsigned
+nid_hash(lustre_hash_t *lh,  void *key, unsigned mask)
+{
+        return lh_djb2_hash(key, sizeof(lnet_nid_t), mask);
+}
+
+static void *
+nid_key(struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+        RETURN(&exp->exp_connection->c_peer.nid);
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+nid_compare(void *key, struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        LASSERT(key);
+        exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+        RETURN(exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key &&
+               !exp->exp_failed);
+}
+
+static void *
+nid_export_get(struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+        class_export_get(exp);
+
+        RETURN(exp);
+}
+
+static void *
+nid_export_put(struct hlist_node *hnode)
+{
+        struct obd_export *exp;
+
+        exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+        class_export_put(exp);
+
+        RETURN(exp);
+}
+
+static lustre_hash_ops_t nid_hash_ops = {
+        .lh_hash    = nid_hash,
+        .lh_key     = nid_key,
+        .lh_compare = nid_compare,
+        .lh_get     = nid_export_get,
+        .lh_put     = nid_export_put,
+};
+
+
+/*
+ * nid<->nidstats hash operations
+ */
+
+static void *
+nidstats_key(struct hlist_node *hnode)
+{
+        struct nid_stat *ns;
+
+        ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+
+        RETURN(&ns->nid);
+}
+
+static int
+nidstats_compare(void *key, struct hlist_node *hnode)
+{
+        RETURN(*(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key);
+}
+
+static void *
+nidstats_get(struct hlist_node *hnode)
+{
+        struct nid_stat *ns;
+
+        ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+        ns->nid_exp_ref_count++;
+
+        RETURN(ns);
+}
+
+static void *
+nidstats_put(struct hlist_node *hnode)
+{
+        struct nid_stat *ns;
+
+        ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+        ns->nid_exp_ref_count--;
+
+        RETURN(ns);
+}
+
+static lustre_hash_ops_t nid_stat_hash_ops = {
+        .lh_hash    = nid_hash,
+        .lh_key     = nidstats_key,
+        .lh_compare = nidstats_compare,
+        .lh_get     = nidstats_get,
+        .lh_put     = nidstats_put,
+};
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c

index aa97caa..d240cba 100644 (file)
--- a/lustre/obdclass/obd_mount.c
+++ b/lustre/obdclass/obd_mount.c
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/obdclass/obd_mount.c
- *  Client/server mount routines
+ * GPL HEADER START
   *
- *  Copyright (c) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org/
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount.c
+ *
+ * Client/server mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  
@@ -150,7 +167,7 @@ struct lustre_mount_info *server_get_mount(char *name)
          lsi = s2lsi(lmi->lmi_sb);
          mntget(lmi->lmi_mnt);
          atomic_inc(&lsi->lsi_mounts);
-
+        
          CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n",
                 lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts),
                 atomic_read(&lmi->lmi_mnt->mnt_count));
@@ -181,7 +198,7 @@ int server_put_mount(char *name, struct vfsmount *mnt)
  
          /* This might be the last one, can't deref after this */
          unlock_mntput(mnt);
-        
+
          down(&lustre_mount_info_lock);
          lmi = server_find_mount(name);
          up(&lustre_mount_info_lock);
@@ -377,7 +394,7 @@ int lustre_process_log(struct super_block *sb, char *logname,
                                     "communication errors between this node and "
                                     "the MGS, a bad configuration, or other "
                                     "errors. See the syslog for more "
-                                   "information.\n", mgc->obd_name, logname, 
+                                   "information.\n", mgc->obd_name, logname,
                                     rc);
  
          /* class_obd_list(); */
@@ -481,16 +498,18 @@ static int server_start_mgs(struct super_block *sb)
  
          rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
  
-        if (!rc &&
-            ((rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
-                                       LUSTRE_MGS_OBDNAME, 0, 0))))
-                server_deregister_mount(LUSTRE_MGS_OBDNAME);
+        if (!rc) {
+                rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
+                                         LUSTRE_MGS_OBDNAME, 0, 0);
+                /* Do NOT call server_deregister_mount() here. This leads to
+                 * inability cleanup cleanly and free lsi and other stuff when
+                 * mgs calls server_put_mount() in error handling case. -umka */
+        }
  
          if (rc)
                  LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d).  Is "
-                                   "the 'mgs' module loaded?\n", 
+                                   "the 'mgs' module loaded?\n",
                                     LUSTRE_MGS_OBDNAME, rc);
-
          RETURN(rc);
  }
  
@@ -517,7 +536,12 @@ static int server_stop_mgs(struct super_block *sb)
  
  DECLARE_MUTEX(mgc_start_lock);
  
-/* Set up a mgcobd to process startup logs */
+/** Set up a mgc obd to process startup logs
+ *
+ * \param sb [in] super block of the mgc obd
+ *
+ * \retval 0 success, otherwise error code
+ */
  static int lustre_start_mgc(struct super_block *sb)
  {
          struct lustre_handle mgc_conn = {0, };
@@ -574,7 +598,7 @@ static int lustre_start_mgc(struct super_block *sb)
          mutex_down(&mgc_start_lock);
  
          obd = class_name2obd(mgcname);
-        if (obd) {
+        if (obd && !obd->obd_stopping) {
                  /* Re-using an existing MGC */
                  atomic_inc(&obd->u.cli.cl_mgc_refcount);
  
@@ -704,7 +728,8 @@ static int lustre_start_mgc(struct super_block *sb)
          OBD_ALLOC_PTR(data);
          if (data == NULL)
                  GOTO(out, rc = -ENOMEM);
-        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT;
+        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+                                  OBD_CONNECT_FID;
          data->ocd_version = LUSTRE_VERSION_CODE;
          /* We connect to the MGS at setup, and don't disconnect until cleanup */
          rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), data, NULL);
@@ -755,7 +780,7 @@ static int lustre_stop_mgc(struct super_block *sb)
                  GOTO(out, rc = -EBUSY);
          }
  
-        /* The MGC has no recoverable data in any case. 
+        /* The MGC has no recoverable data in any case.
           * force shotdown set in umount_begin */
          obd->obd_no_recov = 1;
  
@@ -1069,7 +1094,7 @@ static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
          if (rc == -EINVAL) {
                  LCONSOLE_ERROR_MSG(0x160, "The MGS is refusing to allow this "
                                     "server (%s) to start.  Please see messages "
-                                   "on the MGS node.\n", 
+                                   "on the MGS node.\n",
                                     lsi->lsi_ldd->ldd_svname);
                  GOTO(out_mgc, rc);
          }
@@ -1089,6 +1114,9 @@ static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
          if (rc) {
                  CERROR("failed to start server %s: %d\n",
                         lsi->lsi_ldd->ldd_svname, rc);
+                /* Do NOT call server_deregister_mount() here. This makes it
+                 * impossible to find mount later in cleanup time and leaves
+                 * @lsi and othder stuff leaked. -umka */
                  GOTO(out_mgc, rc);
          }
  
@@ -1148,10 +1176,8 @@ static int lustre_free_lsi(struct super_block *sb)
          struct lustre_sb_info *lsi = s2lsi(sb);
          ENTRY;
  
-        if (!lsi)
-                RETURN(0);
-
-        CDEBUG(D_MOUNT, "Freeing lsi\n");
+        LASSERT(lsi != NULL);
+        CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
  
          /* someone didn't call server_put_mount. */
          LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
@@ -1191,10 +1217,9 @@ static int lustre_put_lsi(struct super_block *sb)
          struct lustre_sb_info *lsi = s2lsi(sb);
          ENTRY;
  
-        LASSERT(lsi);
+        LASSERT(lsi != NULL);
  
          CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
-
          if (atomic_dec_and_test(&lsi->lsi_mounts)) {
                  lustre_free_lsi(sb);
                  RETURN(1);
@@ -1226,14 +1251,28 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb)
             Note ext3/ldiskfs can't be mounted ro. */
          s_flags = sb->s_flags;
  
+        /* allocate memory for options */
+        OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
+        if (!__page)
+                GOTO(out_free, rc = -ENOMEM);
+        page = (unsigned long)cfs_page_address(__page);
+        options = (char *)page;
+        memset(options, 0, CFS_PAGE_SIZE);
+
+        /* mount-line options must be added for pre-mount because it may
+         * contain mount options such as journal_dev which are required
+         * to mount successfuly the underlying filesystem */
+        if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0))
+                strncat(options, lmd->lmd_opts, CFS_PAGE_SIZE - 1);
+
          /* Pre-mount ldiskfs to read the MOUNT_DATA_FILE */
          CDEBUG(D_MOUNT, "Pre-mount ldiskfs %s\n", lmd->lmd_dev);
-        mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, 0);
+        mnt = ll_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, (void *)options);
          if (IS_ERR(mnt)) {
                  rc = PTR_ERR(mnt);
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
                  /* 2.6 kernels: if ldiskfs fails, try ldiskfs2 */
-                mnt = ll_kern_mount("ldiskfs2", s_flags, lmd->lmd_dev, 0);
+                mnt = ll_kern_mount("ldiskfs2", s_flags, lmd->lmd_dev,
+                                    (void *)options);
                  if (IS_ERR(mnt)) {
                          int rc2 = PTR_ERR(mnt);
                          CERROR("premount %s:%#lx ldiskfs failed: %d, ldiskfs2 "
@@ -1241,15 +1280,6 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb)
                                 lmd->lmd_dev, s_flags, rc, rc2);
                          GOTO(out_free, rc);
                  }
-#else
-                /* 2.4 kernels: if ldiskfs fails, try ext3 */
-                mnt = ll_kern_mount("ext3", s_flags, lmd->lmd_dev, 0);
-                if (IS_ERR(mnt)) {
-                        rc = PTR_ERR(mnt);
-                        CERROR("premount ext3 failed: rc = %d\n", rc);
-                        GOTO(out_free, rc);
-                }
-#endif
          }
  
          OBD_SET_CTXT_MAGIC(&mount_ctxt);
@@ -1268,12 +1298,6 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb)
          /* Done with our pre-mount, now do the real mount. */
  
          /* Glom up mount options */
-        OBD_PAGE_ALLOC(__page, CFS_ALLOC_STD);
-        if (!__page)
-                GOTO(out_free, rc = -ENOMEM);
-        page = (unsigned long)cfs_page_address(__page);
-
-        options = (char *)page;
          memset(options, 0, CFS_PAGE_SIZE);
          strncpy(options, ldd->ldd_mount_opts, CFS_PAGE_SIZE - 2);
  
@@ -1293,18 +1317,24 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb)
                 MT_STR(ldd), lmd->lmd_dev, options);
          mnt = ll_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev,
                              (void *)options);
-        OBD_PAGE_FREE(__page);
          if (IS_ERR(mnt)) {
                  rc = PTR_ERR(mnt);
                  CERROR("ll_kern_mount failed: rc = %d\n", rc);
                  GOTO(out_free, rc);
          }
  
+        if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
+                simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
+                                LR_CLIENT_START);
+
+        OBD_PAGE_FREE(__page);
          lsi->lsi_ldd = ldd;   /* freed at lsi cleanup */
          CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
          RETURN(mnt);
  
  out_free:
+        if (__page)
+                OBD_PAGE_FREE(__page);
          OBD_FREE(ldd, sizeof(*ldd));
          lsi->lsi_ldd = NULL;
          RETURN(ERR_PTR(rc));
@@ -1322,10 +1352,9 @@ static void server_wait_finished(struct vfsmount *mnt)
                  LCONSOLE_WARN("Mount still busy with %d refs, waiting for "
                                "%d secs...\n",
                                atomic_read(&mnt->mnt_count), retries);
-
                  /* Wait for a bit */
                  retries -= 5;
-                lwi = LWI_TIMEOUT(5 * HZ, NULL, NULL);
+                lwi = LWI_TIMEOUT(cfs_time_seconds(5), NULL, NULL);
                  l_wait_event(waitq, 0, &lwi);
          }
          if (atomic_read(&mnt->mnt_count) > 1) {
@@ -1354,7 +1383,7 @@ static void server_put_super(struct super_block *sb)
          CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
  
          /* Stop the target */
-        if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && 
+        if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
              (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) {
                  struct lustre_profile *lprof = NULL;
  
@@ -1560,9 +1589,9 @@ static int server_fill_super(struct super_block *sb)
          if (IS_ERR(mnt)) {
                  rc = PTR_ERR(mnt);
                  CERROR("Unable to mount device %s: %d\n",
-                      lsi->lsi_lmd->lmd_dev, rc);
+                       lsi->lsi_lmd->lmd_dev, rc);
                  lustre_put_lsi(sb);
-                GOTO(out, rc);
+                RETURN(rc);
          }
          lsi->lsi_srv_mnt = mnt;
  
@@ -1574,14 +1603,14 @@ static int server_fill_super(struct super_block *sb)
          if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
                  LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
                                     "running. Double-mount may have compromised "
-                                   "the disk journal.\n", 
+                                   "the disk journal.\n",
                                     lsi->lsi_ldd->ldd_svname);
-                unlock_mntput(mnt);
                  lustre_put_lsi(sb);
-                GOTO(out, rc = -EALREADY);
+                unlock_mntput(mnt);
+                RETURN(-EALREADY);
          }
  
-        /* start MGS before MGC */
+        /* Start MGS before MGC */
          if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) {
                  rc = server_start_mgs(sb);
                  if (rc)
@@ -1619,11 +1648,12 @@ static int server_fill_super(struct super_block *sb)
                        lsi->lsi_ldd->ldd_svname, lsi->lsi_lmd->lmd_dev);
  
          RETURN(0);
-
  out_mnt:
+        /* We jump here in case of failure while starting targets or MGS.
+         * In this case we can't just put @mnt and have to do real cleanup
+         * with stoping targets, etc. */
          server_put_super(sb);
-out:
-        RETURN(rc);
+        return rc;
  }
  
  /* Get the index from the obd name.
@@ -1838,8 +1868,11 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
                             must be the last one. */
                          *s1 = '\0';
                          break;
+                } else if (strncmp(s1, "loop=", 5) == 0) {
+                        clear++;
                  }
  
+
                  /* Find next opt */
                  s2 = strchr(s1, ',');
                  if (s2 == NULL) {
@@ -1860,8 +1893,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
                  goto invalid;
          }
  
-        s1 = strrchr(devname, ':');
+        s1 = strstr(devname, ":/");
          if (s1) {
+                ++s1;
                  lmd->lmd_flags = LMD_FLG_CLIENT;
                  /* Remove leading /s from fsname */
                  while (*++s1 == '/') ;
@@ -1919,7 +1953,7 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
          /* Figure out the lmd from the mount options */
          if (lmd_parse((char *)data, lmd)) {
                  lustre_put_lsi(sb);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
          }
  
          if (lmd_is_client(lmd)) {
@@ -1928,18 +1962,19 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
                          LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
                                             "client mount! Is the 'lustre' "
                                             "module loaded?\n");
+                        lustre_put_lsi(sb);
                          rc = -ENODEV;
                  } else {
                          rc = lustre_start_mgc(sb);
                          if (rc) {
                                  lustre_stop_mgc(sb);
-                                goto out;
+                                lustre_put_lsi(sb);
+                                GOTO(out, rc);
                          }
                          /* Connect and start */
                          /* (should always be ll_fill_super) */
                          rc = (*client_fill_super)(sb);
                          /* c_f_s will call lustre_common_put_super on failure */
-
                  }
          } else {
                  CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
@@ -1951,14 +1986,18 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
                  /* s_f_s will call server_put_super on failure */
          }
  
+        /* If error happens in fill_super() call, @lsi will be killed there.
+         * This is why we do not put it here. */
+        GOTO(out, rc);
  out:
-        if (rc){
+        if (rc) {
                  CERROR("Unable to mount %s (%d)\n",
                         s2lsi(sb) ? lmd->lmd_dev : "", rc);
          } else {
-                CDEBUG(D_SUPER, "mount %s complete\n", lmd->lmd_dev);
+                CDEBUG(D_SUPER, "Mount %s complete\n", 
+                       lmd->lmd_dev);
          }
-        RETURN(rc);
+        return rc;
  }
  
  
@@ -1976,7 +2015,6 @@ void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
  
  /***************** FS registration ******************/
  
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  /* 2.5 and later */
  #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
  struct super_block * lustre_get_sb(struct file_system_type *fs_type,
@@ -2018,28 +2056,6 @@ struct file_system_type lustre_fs_type = {
                          LL_RENAME_DOES_D_MOVE,
  };
  
-#else
-/* 2.4 */
-static struct super_block *lustre_read_super(struct super_block *sb,
-                                             void *data, int silent)
-{
-        int rc;
-        ENTRY;
-
-        rc = lustre_fill_super(sb, data, silent);
-        if (rc)
-                RETURN(NULL);
-        RETURN(sb);
-}
-
-static struct file_system_type lustre_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "lustre",
-        .fs_flags       = FS_NFSEXP_FSID,
-        .read_super     = lustre_read_super,
-};
-#endif
-
  int lustre_register_fs(void)
  {
          return register_filesystem(&lustre_fs_type);
@@ -2061,5 +2077,3 @@ EXPORT_SYMBOL(server_register_target);
  EXPORT_SYMBOL(server_name2index);
  EXPORT_SYMBOL(server_mti_print);
  EXPORT_SYMBOL(do_lcfg);
-
-
diff --git a/lustre/obdclass/obdo.c b/lustre/obdclass/obdo.c

index b17b4d3..95e5d52 100644 (file)
--- a/lustre/obdclass/obdo.c
+++ b/lustre/obdclass/obdo.c
@@ -1,28 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Object Devices Class Driver
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * lustre/obdclass/obdo.c
   *
+ * Object Devices Class Driver
   * These are the only exported functions, they provide some generic
   * infrastructure for managing object devices
   */
@@ -69,6 +82,10 @@ void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid)
                  dst->o_flags = src->o_flags;
          if (valid & OBD_MD_FLGENER)
                  dst->o_generation = src->o_generation;
+        if (valid & OBD_MD_FLHANDLE)
+                dst->o_handle = src->o_handle;
+        if (valid & OBD_MD_FLCOOKIE)
+                dst->o_lcookie = src->o_lcookie;
  
          dst->o_valid |= valid;
  }
diff --git a/lustre/obdclass/statfs_pack.c b/lustre/obdclass/statfs_pack.c

index 8e20f85..6196380 100644 (file)
--- a/lustre/obdclass/statfs_pack.c
+++ b/lustre/obdclass/statfs_pack.c
@@ -1,29 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/statfs_pack.c
   *
   * (Un)packing of OST/MDS requests
   *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_CLASS
diff --git a/lustre/obdclass/target.c b/lustre/obdclass/target.c

new file mode 100644 (file)

index 0000000..c723a99
--- /dev/null
+++ b/lustre/obdclass/target.c
@@ -0,0 +1,186 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/class_target.c
+ *
+ * Common methods for target devices
+ *
+ * Author: Mikhail Pershin
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <obd_class.h>
+#include <lustre_fsfilt.h>
+#include <libcfs/list.h>
+#include <lustre_disk.h>
+#include <lustre_lib.h>
+#include <linux/slab.h>
+#include <lustre_param.h>
+#include <obd.h>
+
+/**
+ * Initialize trans_table if it is not initialized yet
+ */
+void target_trans_table_init(struct obd_device *obd)
+{
+        struct lr_server_data *lsd = obd->u.obt.obt_lsd;
+        /** new export or from 1.6 fs */
+        if (le32_to_cpu(lsd->lsd_trans_table_time) == 0)
+                lsd->lsd_trans_table_time = cpu_to_le32(cfs_time_current_sec());
+        /** number of interval changed, write new value */
+        if (le32_to_cpu(lsd->lsd_expire_intervals) == 0)
+                lsd->lsd_expire_intervals = cpu_to_le32(LR_EXPIRE_INTERVALS);
+}
+EXPORT_SYMBOL(target_trans_table_init);
+
+/**
+ * Calculate time by index. All expiration time is divided by LR_EXPIRE_INTERVALS,
+ * so time of each index is calculated from time of first index
+ */
+static inline
+__u32 target_trans_table_slot2time(struct obd_device_target *obt, int idx)
+{
+        __u32 time = le32_to_cpu(obt->obt_lsd->lsd_trans_table_time);
+        __u32 age;
+
+        age = obt->obt_stale_export_age /
+              le32_to_cpu(obt->obt_lsd->lsd_expire_intervals) * idx;
+        return cfs_time_sub(time, age);
+}
+
+/**
+ * Check trans table in server_data to get last time this export was seen
+ */
+__u32 target_trans_table_last_time(struct obd_export *exp)
+{
+        struct obd_device_target *obt = &exp->exp_obd->u.obt;
+        const __u32 slots = le32_to_cpu(obt->obt_lsd->lsd_expire_intervals);
+        __u32 time = cfs_time_current_sec();
+        int i, idx = slots;
+
+        /** return current time */
+        if (obt->obt_stale_export_age == 0)
+                return time;
+
+        spin_lock(&obt->obt_trans_table_lock);
+        for (i = 0; i < slots; i++)
+                if (exp->exp_last_committed <=
+                    le64_to_cpu(obt->obt_lsd->lsd_trans_table[i]))
+                        idx = i;
+        if (idx < slots)
+                time = target_trans_table_slot2time(obt, idx);
+        spin_unlock(&obt->obt_trans_table_lock);
+        return time;
+}
+EXPORT_SYMBOL(target_trans_table_last_time);
+
+/**
+ * Recalculate trans_table slots data if stale_export_age is changed
+ */
+void target_trans_table_recalc(struct obd_device *obd, __u32 new_age)
+{
+        struct obd_device_target *obt = &obd->u.obt;
+        __u32 old_age = obt->obt_stale_export_age;
+        const __u32 slots = le32_to_cpu(obt->obt_lsd->lsd_expire_intervals);
+        __u64 *table = obt->obt_lsd->lsd_trans_table;
+        int i, j;
+
+        /** there is no old info to recalc */
+        if (obt->obt_stale_export_age == 0)
+                return;
+
+        /** Expand table */
+        spin_lock(&obt->obt_trans_table_lock);
+        if (old_age < new_age) {
+                for (j = 0; j < slots; j++) {
+                        i = j * new_age / old_age;
+                        /** no more data for new age */
+                        if (i >= slots)
+                                table[j] = 0;
+                        else
+                                table[j] = table[i];
+                }
+        } else {
+                for (j = slots; j > 0; j--) {
+                        i = (j - 1) * new_age / old_age;
+                        table[j] = table[i];
+                }
+        }
+        spin_unlock(&obt->obt_trans_table_lock);
+}
+EXPORT_SYMBOL(target_trans_table_recalc);
+
+/**
+ * New transno is arrived and it is time for new slot
+ */
+void target_trans_table_update(struct obd_export *exp, __u64 transno)
+{
+        struct obd_device_target *obt = &exp->exp_obd->u.obt;
+        __u32 shift = cfs_time_sub(cfs_time_current_sec(),
+                              le32_to_cpu(obt->obt_lsd->lsd_trans_table_time));
+        __u64 *table = obt->obt_lsd->lsd_trans_table;
+        const __u32 slots = le32_to_cpu(obt->obt_lsd->lsd_expire_intervals);
+        int n = 0, i, j;
+
+        /** how many slots are in shift */
+        if (obt->obt_stale_export_age > 0)
+                n = shift * slots / obt->obt_stale_export_age;
+        /** it is not time to update */
+        if (n == 0)
+                return;
+        spin_lock(&obt->obt_trans_table_lock);
+        /** shift table if there is overlapping or fill with latest transno */
+        for (i = slots - 1; i >= 1; i--) {
+                j = i > n ? i - n : 0;
+                table[i] = table[j];
+        }
+        /** now update first slot with new data */
+        obt->obt_lsd->lsd_trans_table_time = cpu_to_le32(cfs_time_current_sec());
+        obt->obt_lsd->lsd_trans_table[0] = cpu_to_le64(transno);
+        spin_unlock(&obt->obt_trans_table_lock);
+}
+EXPORT_SYMBOL(target_trans_table_update);
+
+int target_fs_version_capable(struct obd_device *obd)
+{
+        struct inode * inode = obd->u.obt.obt_rcvd_filp->f_dentry->d_inode;
+        /** check inode has version */
+        return (fsfilt_get_version(obd, inode) != -EOPNOTSUPP);
+}
+EXPORT_SYMBOL(target_fs_version_capable);
+
diff --git a/lustre/obdclass/uuid.c b/lustre/obdclass/uuid.c

index 5783e09..6059bc6 100644 (file)
--- a/lustre/obdclass/uuid.c
+++ b/lustre/obdclass/uuid.c
@@ -1,7 +1,41 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
  /*
- * Public include file for the UUID library
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/uuid.c
   *
- * Copyright (C) 2007 Cluster File System
+ * Public include file for the UUID library
   */
  
  #define DEBUG_SUBSYSTEM S_CLASS
diff --git a/lustre/obdecho/autoMakefile.am b/lustre/obdecho/autoMakefile.am

index d08aa57..bd83a99 100644 (file)
--- a/lustre/obdecho/autoMakefile.am
+++ b/lustre/obdecho/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if LIBLUSTRE
  noinst_LIBRARIES = libobdecho.a
diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c

index b822c56..dab376b 100644 (file)
--- a/lustre/obdecho/echo.c
+++ b/lustre/obdecho/echo.c
@@ -1,27 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -81,11 +96,17 @@ static int echo_disconnect(struct obd_export *exp)
          return class_disconnect(exp);
  }
  
+static int echo_init_export(struct obd_export *exp)
+{
+        return ldlm_init_export(exp);
+}
+
  static int echo_destroy_export(struct obd_export *exp)
  {
          ENTRY;
  
          target_destroy_export(exp);
+        ldlm_destroy_export(exp);
  
          RETURN(0);
  }
@@ -267,16 +288,79 @@ echo_page_debug_check(cfs_page_t *page, obd_id id,
  /* This allows us to verify that desc_private is passed unmolested */
  #define DESC_PRIV 0x10293847
  
+static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj,
+                             struct niobuf_remote *nb, int *pages,
+                             struct niobuf_local *lb, int cmd, int *left)
+{
+        int gfp_mask = (obj->ioo_id & 1) ? CFS_ALLOC_HIGHUSER : CFS_ALLOC_STD;
+        int ispersistent = obj->ioo_id == ECHO_PERSISTENT_OBJID;
+        int debug_setup = (!ispersistent &&
+                           (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+                           (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+        struct niobuf_local *res = lb;
+        obd_off offset = nb->offset;
+        int len = nb->len;
+
+        while (len > 0) {
+                int plen = CFS_PAGE_SIZE - (offset & (CFS_PAGE_SIZE-1));
+                if (len < plen)
+                        plen = len;
+
+                /* check for local buf overflow */
+                if (*left == 0)
+                        return -EINVAL;
+
+                res->offset = offset;
+                res->len = plen;
+                LASSERT((res->offset & ~CFS_PAGE_MASK) + res->len <= CFS_PAGE_SIZE);
+
+
+                if (ispersistent &&
+                    (res->offset >> CFS_PAGE_SHIFT) < ECHO_PERSISTENT_PAGES) {
+                        res->page = echo_persistent_pages[res->offset >>
+                                CFS_PAGE_SHIFT];
+                        /* Take extra ref so __free_pages() can be called OK */
+                        cfs_get_page (res->page);
+                } else {
+                        OBD_PAGE_ALLOC(res->page, gfp_mask);
+                        if (res->page == NULL) {
+                                CERROR("can't get page for id " LPU64"\n",
+                                       obj->ioo_id);
+                                return -ENOMEM;
+                        }
+                }
+
+                CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n",
+                       res->page, res->offset, res->len);
+
+                if (cmd & OBD_BRW_READ)
+                        res->rc = res->len;
+
+                if (debug_setup)
+                        echo_page_debug_setup(res->page, cmd, obj->ioo_id,
+                                              res->offset, res->len);
+
+                offset += plen;
+                len -= plen;
+                res++;
+
+                (*left)--;
+                (*pages)++;
+        }
+        
+        return 0;
+}
+
  int echo_preprw(int cmd, struct obd_export *export, struct obdo *oa,
-                int objcount, struct obd_ioobj *obj, int niocount,
-                struct niobuf_remote *nb, struct niobuf_local *res,
+                int objcount, struct obd_ioobj *obj, struct niobuf_remote *nb,
+                int *pages, struct niobuf_local *res,
                  struct obd_trans_info *oti)
  {
          struct obd_device *obd;
          struct niobuf_local *r = res;
          int tot_bytes = 0;
          int rc = 0;
-        int i;
+        int i, left;
          ENTRY;
  
          obd = export->exp_obd;
@@ -286,59 +370,33 @@ int echo_preprw(int cmd, struct obd_export *export, struct obdo *oa,
          /* Temp fix to stop falling foul of osc_announce_cached() */
          oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT);
  
-        memset(res, 0, sizeof(*res) * niocount);
+        memset(res, 0, sizeof(*res) * *pages);
  
          CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n",
-               cmd == OBD_BRW_READ ? "reading" : "writing", objcount, niocount);
+               cmd == OBD_BRW_READ ? "reading" : "writing", objcount, *pages);
  
          if (oti)
                  oti->oti_handle = (void *)DESC_PRIV;
  
+        left = *pages;
+        *pages = 0;
+
          for (i = 0; i < objcount; i++, obj++) {
-                int gfp_mask = (obj->ioo_id & 1) ? CFS_ALLOC_HIGHUSER : CFS_ALLOC_STD;
-                int ispersistent = obj->ioo_id == ECHO_PERSISTENT_OBJID;
-                int debug_setup = (!ispersistent &&
-                                   (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
-                                   (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
                  int j;
  
                  for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++, r++) {
  
-                        if (ispersistent &&
-                            (nb->offset >> CFS_PAGE_SHIFT) < ECHO_PERSISTENT_PAGES) {
-                                r->page = echo_persistent_pages[nb->offset >>
-                                                                CFS_PAGE_SHIFT];
-                                /* Take extra ref so __free_pages() can be called OK */
-                                cfs_get_page (r->page);
-                        } else {
-                                OBD_PAGE_ALLOC(r->page, gfp_mask);
-                                if (r->page == NULL) {
-                                        CERROR("can't get page %u/%u for id "
-                                               LPU64"\n",
-                                               j, obj->ioo_bufcnt, obj->ioo_id);
-                                        GOTO(preprw_cleanup, rc = -ENOMEM);
-                                }
-                        }
+                        rc = echo_map_nb_to_lb(oa, obj, nb, pages,
+                                               res + *pages, cmd, &left);
+                        if (rc)
+                                GOTO(preprw_cleanup, rc);
  
                          tot_bytes += nb->len;
-
-                        atomic_inc(&obd->u.echo.eo_prep);
-
-                        r->offset = nb->offset;
-                        r->len = nb->len;
-                        LASSERT((r->offset & ~CFS_PAGE_MASK) + r->len <= CFS_PAGE_SIZE);
-
-                        CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n",
-                               r->page, r->offset, r->len);
-
-                        if (cmd & OBD_BRW_READ)
-                                r->rc = r->len;
-
-                        if (debug_setup)
-                                echo_page_debug_setup(r->page, cmd, obj->ioo_id,
-                                                      r->offset, r->len);
                  }
          }
+
+        atomic_add(*pages, &obd->u.echo.eo_prep);
+
          if (cmd & OBD_BRW_READ)
                  lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES,
                                      tot_bytes);
@@ -357,21 +415,22 @@ preprw_cleanup:
           * all down again.  I believe that this is what the in-kernel
           * prep/commit operations do.
           */
-        CERROR("cleaning up %ld pages (%d obdos)\n", (long)(r - res), objcount);
-        while (r-- > res) {
-                cfs_kunmap(r->page);
+        CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount);
+        for (i = 0; i < *pages; i++) {
+                cfs_kunmap(res[i].page);
                  /* NB if this is a persistent page, __free_pages will just
                   * lose the extra ref gained above */
-                OBD_PAGE_FREE(r->page);
+                OBD_PAGE_FREE(res[i].page);
+                res[i].page = NULL;
                  atomic_dec(&obd->u.echo.eo_prep);
          }
-        memset(res, 0, sizeof(*res) * niocount);
  
          return rc;
  }
  
  int echo_commitrw(int cmd, struct obd_export *export, struct obdo *oa,
-                  int objcount, struct obd_ioobj *obj, int niocount,
+                  int objcount, struct obd_ioobj *obj,
+                  struct niobuf_remote *rb, int niocount,
                    struct niobuf_local *res, struct obd_trans_info *oti, int rc)
  {
          struct obd_device *obd;
@@ -524,6 +583,7 @@ static struct obd_ops echo_obd_ops = {
          .o_owner           = THIS_MODULE,
          .o_connect         = echo_connect,
          .o_disconnect      = echo_disconnect,
+        .o_init_export     = echo_init_export,
          .o_destroy_export  = echo_destroy_export,
          .o_create          = echo_create,
          .o_destroy         = echo_destroy,
@@ -581,7 +641,7 @@ static int __init obdecho_init(void)
          int rc;
  
          ENTRY;
-        printk(KERN_INFO "Lustre: Echo OBD driver; info@clusterfs.com\n");
+        printk(KERN_INFO "Lustre: Echo OBD driver; http://www.lustre.org/\n");
  
          LASSERT(CFS_PAGE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
  
@@ -614,7 +674,7 @@ static void /*__exit*/ obdecho_exit(void)
          echo_persistent_pages_fini ();
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Testing Echo OBD driver");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c

index b1dc7f5..9c9a6b6 100644 (file)
--- a/lustre/obdecho/echo_client.c
+++ b/lustre/obdecho/echo_client.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_ECHO
@@ -864,6 +876,8 @@ static int echo_client_prep_commit(struct obd_export *exp, int rw,
          off = offset;
  
          for(; tot_pages; tot_pages -= npages) {
+                int lpages;
+
                  if (tot_pages < npages)
                          npages = tot_pages;
  
@@ -875,11 +889,13 @@ static int echo_client_prep_commit(struct obd_export *exp, int rw,
                  ioo.ioo_bufcnt = npages;
                  oti->oti_transno = 0;
  
-                ret = obd_preprw(rw, exp, oa, 1, &ioo, npages, rnb, lnb, oti);
+                lpages = npages;
+                ret = obd_preprw(rw, exp, oa, 1, &ioo, rnb, &lpages, lnb, oti);
                  if (ret != 0)
                          GOTO(out, ret);
+                LASSERT(lpages == npages);
  
-                for (i = 0; i < npages; i++) {
+                for (i = 0; i < lpages; i++) {
                          cfs_page_t *page = lnb[i].page;
  
                          /* read past eof? */
@@ -903,7 +919,7 @@ static int echo_client_prep_commit(struct obd_export *exp, int rw,
                                                               rnb[i].len);
                  }
  
-                ret = obd_commitrw(rw, exp, oa, 1, &ioo, npages, lnb, oti, ret);
+                ret = obd_commitrw(rw, exp, oa, 1,&ioo,rnb,npages,lnb,oti,ret);
                  if (ret != 0)
                          GOTO(out, ret);
          }
@@ -921,7 +937,7 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp,
  {
          struct obd_device *obd = class_exp2obd(exp);
          struct echo_client_obd *ec = &obd->u.echo_client;
-        struct obd_trans_info dummy_oti = { .oti_thread_id = -1 };
+        struct obd_trans_info dummy_oti = { .oti_thread = NULL };
          struct ec_object *eco;
          int rc;
          ENTRY;
@@ -1136,7 +1152,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
  
          switch (cmd) {
          case OBD_IOC_CREATE:                    /* may create echo object */
-                if (!capable (CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
                  rc = echo_create_object (obd, 1, &data->ioc_obdo1,
@@ -1145,7 +1161,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  GOTO(out, rc);
  
          case OBD_IOC_DESTROY:
-                if (!capable (CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
                  rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
                  if (rc == 0) {
@@ -1172,7 +1188,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  GOTO(out, rc);
  
          case OBD_IOC_SETATTR:
-                if (!capable (CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
                  rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
@@ -1187,7 +1203,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  GOTO(out, rc);
  
          case OBD_IOC_BRW_WRITE:
-                if (!capable (CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
                  rw = OBD_BRW_WRITE;
@@ -1206,7 +1222,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  GOTO(out, rc);
  
          case ECHO_IOC_SET_STRIPE:
-                if (!capable (CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
                  if (data->ioc_pbuf1 == NULL) {  /* unset */
@@ -1223,7 +1239,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  GOTO (out, rc);
  
          case ECHO_IOC_ENQUEUE:
-                if (!capable (CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
                  rc = echo_client_enqueue(exp, &data->ioc_obdo1,
diff --git a/lustre/obdecho/lproc_echo.c b/lustre/obdecho/lproc_echo.c

index 42ded58..bdf0713 100644 (file)
--- a/lustre/obdecho/lproc_echo.c
+++ b/lustre/obdecho/lproc_echo.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_ECHO
  
diff --git a/lustre/obdfilter/Makefile.in b/lustre/obdfilter/Makefile.in

index 8305eb5..fd7412a 100644 (file)
--- a/lustre/obdfilter/Makefile.in
+++ b/lustre/obdfilter/Makefile.in
@@ -2,11 +2,6 @@ MODULES := obdfilter
  
  obdfilter-objs := filter.o filter_io.o filter_log.o
  obdfilter-objs += lproc_obdfilter.o filter_lvb.o
-
-ifeq ($(PATCHLEVEL),4)
-obdfilter-objs += filter_io_24.o
-else
  obdfilter-objs += filter_io_26.o
-endif # PATCHLEVEL 
  
  @INCLUDE_RULES@
diff --git a/lustre/obdfilter/autoMakefile.am b/lustre/obdfilter/autoMakefile.am

index 5f90afb..89490fb 100644 (file)
--- a/lustre/obdfilter/autoMakefile.am
+++ b/lustre/obdfilter/autoMakefile.am
@@ -1,11 +1,42 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if MODULES
  modulefs_DATA = obdfilter$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(obdfilter-objs:%.o=%.c) filter_io_24.c filter_io_26.c filter_internal.h
+DIST_SOURCES = $(obdfilter-objs:%.o=%.c) filter_io_26.c filter_internal.h
diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c

index 1e991ba..7438201 100644 (file)
--- a/lustre/obdfilter/filter.c
+++ b/lustre/obdfilter/filter.c
@@ -1,26 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/fs/obdfilter/filter.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdfilter/filter.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  /*
@@ -43,10 +59,8 @@
  #include <linux/init.h>
  #include <linux/version.h>
  #include <linux/sched.h>
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-# include <linux/mount.h>
-# include <linux/buffer_head.h>
-#endif
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
  
  #include <obd_class.h>
  #include <obd_lov.h>
@@ -54,12 +68,12 @@
  #include <lustre_fsfilt.h>
  #include <lprocfs_status.h>
  #include <lustre_log.h>
-#include <lustre_commit_confd.h>
  #include <libcfs/list.h>
  #include <lustre_disk.h>
  #include <lustre_quota.h>
  #include <linux/slab.h>
  #include <lustre_param.h>
+#include <lustre/ll_fiemap.h>
  
  #include "filter_internal.h"
  
@@ -69,11 +83,38 @@ cfs_mem_cache_t *ll_fmd_cachep;
  static void filter_commit_cb(struct obd_device *obd, __u64 transno,
                               void *cb_data, int error)
  {
-        obd_transno_commit_cb(obd, transno, error);
+        struct obd_export *exp = cb_data;
+        obd_transno_commit_cb(obd, transno, exp, error);
+}
+
+int filter_version_get_check(struct obd_export *exp,
+                             struct obd_trans_info *oti, struct inode *inode)
+{
+        __u64 curr_version;
+
+        if (inode == NULL || oti == NULL)
+                RETURN(0);
+
+        curr_version = fsfilt_get_version(exp->exp_obd, inode);
+        if ((__s64)curr_version == -EOPNOTSUPP)
+                RETURN(0);
+        /* VBR: version is checked always because costs nothing */
+        if (oti->oti_pre_version != 0 &&
+            oti->oti_pre_version != curr_version) {
+                CDEBUG(D_INODE, "Version mismatch "LPX64" != "LPX64"\n",
+                       oti->oti_pre_version, curr_version);
+                spin_lock(&exp->exp_lock);
+                exp->exp_vbr_failed = 1;
+                spin_unlock(&exp->exp_lock);
+                RETURN (-EOVERFLOW);
+        }
+        oti->oti_pre_version = curr_version;
+        RETURN(0);
  }
  
  /* Assumes caller has already pushed us into the kernel context. */
-int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti,
+int filter_finish_transno(struct obd_export *exp, struct inode *inode,
+                          struct obd_trans_info *oti,
                            int rc, int force_sync)
  {
          struct filter_obd *filter = &exp->exp_obd->u.filter;
@@ -91,24 +132,29 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti,
                  RETURN(rc);
  
          /* we don't allocate new transnos for replayed requests */
+        spin_lock(&filter->fo_translock);
          if (oti->oti_transno == 0) {
-                spin_lock(&filter->fo_translock);
                  last_rcvd = le64_to_cpu(filter->fo_fsd->lsd_last_transno) + 1;
                  filter->fo_fsd->lsd_last_transno = cpu_to_le64(last_rcvd);
-                spin_unlock(&filter->fo_translock);
-                oti->oti_transno = last_rcvd;
          } else {
-                spin_lock(&filter->fo_translock);
                  last_rcvd = oti->oti_transno;
                  if (last_rcvd > le64_to_cpu(filter->fo_fsd->lsd_last_transno))
-                        filter->fo_fsd->lsd_last_transno =
-                                cpu_to_le64(last_rcvd);
+                        filter->fo_fsd->lsd_last_transno = cpu_to_le64(last_rcvd);
+        }
+        oti->oti_transno = last_rcvd;
+        if (last_rcvd <= le64_to_cpu(lcd->lcd_last_transno)) {
                  spin_unlock(&filter->fo_translock);
+                LBUG();
          }
          lcd->lcd_last_transno = cpu_to_le64(last_rcvd);
+        lcd->lcd_pre_versions[0] = cpu_to_le64(oti->oti_pre_version);
+        lcd->lcd_last_xid = cpu_to_le64(oti->oti_xid);
+        target_trans_table_update(exp, last_rcvd);
+
+        spin_unlock(&filter->fo_translock);
  
-        /* could get xid from oti, if it's ever needed */
-        lcd->lcd_last_xid = 0;
+        if (inode)
+                fsfilt_set_version(exp->exp_obd, inode, last_rcvd);
  
          off = fed->fed_lr_off;
          if (off <= 0) {
@@ -117,17 +163,17 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti,
                  err = -EINVAL;
          } else {
                  if (!force_sync)
-                        force_sync = fsfilt_add_journal_cb(exp->exp_obd, 
+                        force_sync = fsfilt_add_journal_cb(exp->exp_obd,
                                                             last_rcvd,
                                                             oti->oti_handle,
                                                             filter_commit_cb,
-                                                           NULL);
+                                                           exp);
  
                  err = fsfilt_write_record(exp->exp_obd, filter->fo_rcvd_filp,
                                            lcd, sizeof(*lcd), &off,
                                            force_sync | exp->exp_need_sync);
                  if (force_sync)
-                        filter_commit_cb(exp->exp_obd, last_rcvd, NULL, err);
+                        filter_commit_cb(exp->exp_obd, last_rcvd, exp, err);
          }
          if (err) {
                  log_pri = D_ERROR;
@@ -194,9 +240,15 @@ static int filter_export_stats_init(struct obd_device *obd,
          if (obd_uuid_equals(&exp->exp_client_uuid, &obd->obd_uuid))
                  /* Self-export gets no proc entry */
                  RETURN(0);
-        rc = lprocfs_exp_setup(exp, client_nid, &newnid);
-        if (rc)
+        rc = lprocfs_exp_setup(exp, (lnet_nid_t *)client_nid, &newnid);
+        if (rc) {
+                /* Mask error for already created
+                 * /proc entries */
+                if (rc == -EALREADY)
+                        rc = 0;
+
                  RETURN(rc);
+        }
  
          if (newnid) {
                  struct nid_stat *tmp = exp->exp_nid_stats;
@@ -224,7 +276,85 @@ static int filter_export_stats_init(struct obd_device *obd,
                                              tmp->nid_stats);
                  if (rc)
                          RETURN(rc);
+
+                /* Always add in ldlm_stats */
+                tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC -
+                                                          LDLM_FIRST_OPC, 0);
+                if (tmp->nid_ldlm_stats == NULL)
+                        return -ENOMEM;
+
+                lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+                rc = lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+                                            tmp->nid_ldlm_stats);
+        }
+
+        RETURN(0);
+}
+
+/* VBR: to determine the delayed client the lcd should be updated for each new
+ * epoch */
+static int filter_update_client_epoch(struct obd_export *exp)
+{
+        struct filter_export_data *fed = &exp->exp_filter_data;
+        struct filter_obd *filter = &exp->exp_obd->u.filter;
+        struct lvfs_run_ctxt saved;
+        loff_t off = fed->fed_lr_off;
+        int rc = 0;
+
+        /* VBR: set client last_epoch to current epoch */
+        if (le32_to_cpu(fed->fed_lcd->lcd_last_epoch) >=
+                        le32_to_cpu(filter->fo_fsd->lsd_start_epoch))
+                return rc;
+        fed->fed_lcd->lcd_last_epoch = filter->fo_fsd->lsd_start_epoch;
+        push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+        rc = fsfilt_write_record(exp->exp_obd, filter->fo_rcvd_filp,
+                                 fed->fed_lcd, sizeof(*fed->fed_lcd), &off,
+                                 exp->exp_delayed);
+        pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+
+        CDEBUG(D_INFO, "update client idx %u last_epoch %#x (%#x)\n",
+               fed->fed_lr_idx, le32_to_cpu(fed->fed_lcd->lcd_last_epoch),
+               le32_to_cpu(filter->fo_fsd->lsd_start_epoch));
+
+        return rc;
+}
+
+/* Called after recovery is done on server */
+static void filter_update_last_epoch(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        struct filter_obd *filter = &obd->u.filter;
+        struct lr_server_data *fsd = filter->fo_fsd;
+        __u32 start_epoch;
+
+        /* Increase server epoch after recovery */
+        spin_lock(&filter->fo_translock);
+        /* VBR: increase the epoch and store it in lsd */
+        start_epoch = lr_epoch(le64_to_cpu(fsd->lsd_last_transno)) + 1;
+        fsd->lsd_last_transno = cpu_to_le64((__u64)start_epoch << LR_EPOCH_BITS);
+        fsd->lsd_start_epoch = cpu_to_le32(start_epoch);
+        spin_unlock(&filter->fo_translock);
+
+        /* go through delayed reply queue to find all exports participate in
+         * recovery and set new epoch for them */
+        list_for_each_entry(req, &obd->obd_delayed_reply_queue, rq_list) {
+                LASSERT(!req->rq_export->exp_delayed);
+                filter_update_client_epoch(req->rq_export);
          }
+        filter_update_server_data(obd, filter->fo_rcvd_filp, fsd, 1);
+}
+
+static int filter_postrecov(struct obd_device *obd)
+{
+        ENTRY;
+
+        if (obd->obd_fail)
+                RETURN(0);
+
+        LASSERT(!obd->obd_recovering);
+        /* VBR: update start_epoch on server */
+        filter_update_last_epoch(obd);
  
          RETURN(0);
  }
@@ -250,6 +380,10 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp,
          if (strcmp(fed->fed_lcd->lcd_uuid, obd->obd_uuid.uuid) == 0)
                  RETURN(0);
  
+        /* VBR: remove expired exports before searching for free slot */
+        if (new_client)
+                class_disconnect_expired_exports(obd);
+
          /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so
           * there's no need for extra complication here
           */
@@ -300,6 +434,9 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp,
                          rc = PTR_ERR(handle);
                          CERROR("unable to start transaction: rc %d\n", rc);
                  } else {
+                        fed->fed_lcd->lcd_last_epoch =
+                                              filter->fo_fsd->lsd_start_epoch;
+                        exp->exp_last_request_time = cfs_time_current_sec();
                          rc = fsfilt_add_journal_cb(obd, 0, handle,
                                                     target_client_add_cb, exp);
                          if (rc == 0) {
@@ -326,12 +463,13 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp,
          RETURN(0);
  }
  
+struct lsd_client_data zero_lcd; /* globals are implicitly zeroed */
+
  static int filter_client_free(struct obd_export *exp)
  {
          struct filter_export_data *fed = &exp->exp_filter_data;
          struct filter_obd *filter = &exp->exp_obd->u.filter;
          struct obd_device *obd = exp->exp_obd;
-        struct lsd_client_data zero_lcd;
          struct lvfs_run_ctxt saved;
          int rc;
          loff_t off;
@@ -345,7 +483,7 @@ static int filter_client_free(struct obd_export *exp)
                  GOTO(free, 0);
  
          CDEBUG(D_INFO, "freeing client at idx %u, offset %lld with UUID '%s'\n",
-               fed->fed_lr_idx, off, fed->fed_lcd->lcd_uuid);
+               fed->fed_lr_idx, fed->fed_lr_off, fed->fed_lcd->lcd_uuid);
  
          LASSERT(filter->fo_last_rcvd_slots != NULL);
  
@@ -368,24 +506,27 @@ static int filter_client_free(struct obd_export *exp)
          }
  
          if (!(exp->exp_flags & OBD_OPT_FAILOVER)) {
-                memset(&zero_lcd, 0, sizeof(zero_lcd));
+                /* Don't force sync on disconnect if aborting recovery,
+                 * or it does num_clients * num_osts.  b=17194 */
+                int need_sync = (!exp->exp_libclient || exp->exp_need_sync) &&
+                                !(exp->exp_flags&OBD_OPT_ABORT_RECOV);
+
                  push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                  rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_lcd,
-                                         sizeof(zero_lcd), &off,
-                                         (!exp->exp_libclient ||
-                                          exp->exp_need_sync));
+                                         sizeof(zero_lcd), &off, 0);
  
+                /* Make sure the server's last_transno is up to date. Do this
+                 * after the client is freed so we know all the client's
+                 * transactions have been committed. */
                  if (rc == 0)
-                        /* update server's transno */
                          filter_update_server_data(obd, filter->fo_rcvd_filp,
-                                                  filter->fo_fsd,
-                                                  !exp->exp_libclient);
+                                                  filter->fo_fsd, need_sync);
                  pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
  
                  CDEBUG(rc == 0 ? D_INFO : D_ERROR,
-                       "zeroing out client %s at idx %u (%llu) in %s rc %d\n",
+                       "zero out client %s at idx %u/%llu in %s %ssync rc %d\n",
                         fed->fed_lcd->lcd_uuid, fed->fed_lr_idx, fed->fed_lr_off,
-                       LAST_RCVD, rc);
+                       LAST_RCVD, need_sync ? "" : "a", rc);
          }
  
          if (!test_and_clear_bit(fed->fed_lr_idx, filter->fo_last_rcvd_slots)) {
@@ -577,12 +718,12 @@ static int filter_init_export(struct obd_export *exp)
  {
          spin_lock_init(&exp->exp_filter_data.fed_lock);
          INIT_LIST_HEAD(&exp->exp_filter_data.fed_mod_list);
-        
+
          spin_lock(&exp->exp_lock);
          exp->exp_connecting = 1;
          spin_unlock(&exp->exp_lock);
  
-        return 0;
+        return ldlm_init_export(exp);
  }
  
  static int filter_free_server_data(struct filter_obd *filter)
@@ -652,6 +793,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
          struct inode *inode = filp->f_dentry->d_inode;
          unsigned long last_rcvd_size = i_size_read(inode);
          __u64 mount_count;
+        __u32 start_epoch;
          int cl_idx;
          loff_t off = 0;
          int rc;
@@ -723,7 +865,12 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                  GOTO(err_fsd, rc = -EINVAL);
          }
  
-        CDEBUG(D_INODE, "%s: server last_transno : "LPU64"\n",
+        target_trans_table_init(obd);
+        start_epoch = le32_to_cpu(fsd->lsd_start_epoch);
+
+        CDEBUG(D_INODE, "%s: server start_epoch : %#x\n",
+               obd->obd_name, start_epoch);
+        CDEBUG(D_INODE, "%s: server last_transno : "LPX64"\n",
                 obd->obd_name, le64_to_cpu(fsd->lsd_last_transno));
          CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
                 obd->obd_name, mount_count + 1);
@@ -801,20 +948,34 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                          /* can't fail for existing client */
                          LASSERTF(rc == 0, "rc = %d\n", rc);
  
-                        lcd = NULL;
+                        /* VBR: set export last committed */
+                        exp->exp_last_committed = last_rcvd;
+                        /* read last time from disk */
+                        exp->exp_last_request_time = target_trans_table_last_time(exp);
  
                          spin_lock(&exp->exp_lock);
                          exp->exp_replay_needed = 1;
                          exp->exp_connecting = 0;
+                        exp->exp_in_recovery = 0;
                          spin_unlock(&exp->exp_lock);
  
+                        spin_lock_bh(&obd->obd_processing_task_lock);
                          obd->obd_recoverable_clients++;
                          obd->obd_max_recoverable_clients++;
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+                        /* VBR: if epoch too old mark export as delayed,
+                         * if epoch is zero then client is pre-vbr one */
+                        if (start_epoch > le32_to_cpu(lcd->lcd_last_epoch) &&
+                            le32_to_cpu(lcd->lcd_last_epoch) != 0)
+                                class_set_export_delayed(exp);
+
+                        lcd = NULL;
                          class_export_put(exp);
                  }
  
                  /* Need to check last_rcvd even for duplicated exports. */
-                CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
+                CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPX64"\n",
                         cl_idx, last_rcvd);
  
                  if (last_rcvd > le64_to_cpu(fsd->lsd_last_transno))
@@ -828,8 +989,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
  
          if (obd->obd_recoverable_clients) {
                  CWARN("RECOVERY: service %s, %d recoverable clients, "
-                      "last_rcvd "LPU64"\n", obd->obd_name,
-                      obd->obd_recoverable_clients,
+                      "%d delayed clients, last_rcvd "LPU64"\n",
+                      obd->obd_name, obd->obd_recoverable_clients,
+                      obd->obd_delayed_clients,
                        le64_to_cpu(fsd->lsd_last_transno));
                  obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
                  obd->obd_recovering = 1;
@@ -840,8 +1002,11 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                  /* b13079: this should be set to desired value for ost */
                  obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
  #endif
+        } else {
+                LASSERT(!obd->obd_recovering);
+                /* VBR: update boot epoch after recovery */
+                filter_update_last_epoch(obd);
          }
-
  out:
          filter->fo_mount_count = mount_count + 1;
          fsd->lsd_mount_count = cpu_to_le64(filter->fo_mount_count);
@@ -920,7 +1085,8 @@ static int filter_prep_groups(struct obd_device *obd)
          int i, rc = 0, cleanup_phase = 0;
          ENTRY;
  
-        O_dentry = simple_mkdir(current->fs->pwd, "O", 0700, 1);
+        O_dentry = simple_mkdir(current->fs->pwd, filter->fo_vfsmnt,
+                                "O", 0700, 1);
          CDEBUG(D_INODE, "got/created O: %p\n", O_dentry);
          if (IS_ERR(O_dentry)) {
                  rc = PTR_ERR(O_dentry);
@@ -947,7 +1113,8 @@ static int filter_prep_groups(struct obd_device *obd)
                  loff_t off = 0;
  
                  sprintf(name, "%d", i);
-                dentry = simple_mkdir(O_dentry, name, 0700, 1);
+                dentry = simple_mkdir(O_dentry, filter->fo_vfsmnt,
+                                      name, 0700, 1);
                  CDEBUG(D_INODE, "got/created O/%s: %p\n", name, dentry);
                  if (IS_ERR(dentry)) {
                          rc = PTR_ERR(dentry);
@@ -998,7 +1165,8 @@ static int filter_prep_groups(struct obd_device *obd)
                          char dir[20];
                          snprintf(dir, sizeof(dir), "d%u", i);
  
-                        dentry = simple_mkdir(O_dentry, dir, 0700, 1);
+                        dentry = simple_mkdir(O_dentry, filter->fo_vfsmnt,
+                                              dir, 0700, 1);
                          CDEBUG(D_INODE, "got/created O/0/%s: %p\n", dir,dentry);
                          if (IS_ERR(dentry)) {
                                  rc = PTR_ERR(dentry);
@@ -1291,7 +1459,8 @@ static int filter_prepare_destroy(struct obd_device *obd, obd_id objid)
   * i_sem before starting a handle, while filter_destroy() + vfs_unlink do the
   * reverse.  Caller must take i_sem before starting the transaction and we
   * drop it here before the inode is removed from the dentry.  bug 4180/6984 */
-int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
+int filter_vfs_unlink(struct inode *dir, struct dentry *dentry,
+                      struct vfsmount *mnt)
  {
          int rc;
          ENTRY;
@@ -1315,8 +1484,9 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
                  GOTO(out, rc = -EPERM);
  
          /* check_sticky() */
-        if ((dentry->d_inode->i_uid != current->fsuid && !capable(CAP_FOWNER))||
-            IS_APPEND(dentry->d_inode) || IS_IMMUTABLE(dentry->d_inode))
+        if ((dentry->d_inode->i_uid != current->fsuid &&
+             !cfs_capable(CFS_CAP_FOWNER)) || IS_APPEND(dentry->d_inode) ||
+            IS_IMMUTABLE(dentry->d_inode))
                  GOTO(out, rc = -EPERM);
  
          /* NOTE: This might need to go outside i_mutex, though it isn't clear if
@@ -1324,12 +1494,9 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
           *       here) or some other ordering issue. */
          DQUOT_INIT(dir);
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        rc = security_inode_unlink(dir, dentry);
+        rc = ll_security_inode_unlink(dir, dentry, mnt);
          if (rc)
                  GOTO(out, rc);
-#endif
-
          rc = dir->i_op->unlink(dir, dentry);
  out:
          /* need to drop i_mutex before we lose inode reference */
@@ -1348,6 +1515,7 @@ static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
                                     struct dentry *dchild)
  {
          struct inode *inode = dchild->d_inode;
+        struct filter_obd *filter = &obd->u.filter;
          int rc;
  
          if (inode->i_nlink != 1 || atomic_read(&inode->i_count) != 1) {
@@ -1357,7 +1525,7 @@ static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
                         atomic_read(&inode->i_count));
          }
  
-        rc = filter_vfs_unlink(dparent->d_inode, dchild);
+        rc = filter_vfs_unlink(dparent->d_inode, dchild, filter->fo_vfsmnt);
          if (rc)
                  CERROR("error unlinking objid %.*s: rc %d\n",
                         dchild->d_name.len, dchild->d_name.name, rc);
@@ -1426,7 +1594,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
          int idx, rc, tmpflags = 0, only_liblustre = 1;
          struct ldlm_interval_tree *tree;
          struct filter_intent_args arg;
-        int repsize[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
+        __u32 repsize[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                             [DLM_LOCKREPLY_OFF]   = sizeof(*rep),
                             [DLM_REPLY_REC_OFF]   = sizeof(*reply_lvb) };
          ENTRY;
@@ -1448,14 +1616,11 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
  
          //fixup_handle_for_resent_req(req, lock, &lockh);
  
-        /* If we grant any lock at all, it will be a whole-file read lock.
-         * Call the extent policy function to see if our request can be
-         * granted, or is blocked. 
-         * If the OST lock has LDLM_FL_HAS_INTENT set, it means a glimpse lock
+        /* Call the extent policy function to see if our request can be
+         * granted, or is blocked.
+         * If the OST lock has LDLM_FL_HAS_INTENT set, it means a glimpse
+         * lock, and should not be granted if the lock will be blocked.
           */
-        lock->l_policy_data.l_extent.start = 0;
-        lock->l_policy_data.l_extent.end = OBD_OBJECT_EOF;
-        lock->l_req_mode = LCK_PR;
  
          LASSERT(ns == res->lr_namespace);
          lock_res(res);
@@ -1515,7 +1680,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
                  if (tree->lit_mode == LCK_PR)
                          continue;
  
-                interval_iterate_reverse(tree->lit_root, 
+                interval_iterate_reverse(tree->lit_root,
                                           filter_intent_cb, &arg);
          }
          unlock_res(res);
@@ -1648,7 +1813,8 @@ static int filter_iobuf_pool_init(struct filter_obd *filter)
   * If we haven't allocated a pool entry for this thread before, do so now. */
  void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti)
  {
-        int thread_id                    = oti ? oti->oti_thread_id : -1;
+        int thread_id                    = (oti && oti->oti_thread) ?
+                                           oti->oti_thread->t_id : -1;
          struct filter_iobuf  *pool       = NULL;
          struct filter_iobuf **pool_place = NULL;
  
@@ -1684,6 +1850,9 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
          __u8 *uuid_ptr;
          char *str, *label;
          char ns_name[48];
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+        request_queue_t *q;
+#endif
          int rc;
          ENTRY;
  
@@ -1736,6 +1905,9 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
  
          filter->fo_vfsmnt = mnt;
          obd->u.obt.obt_sb = mnt->mnt_sb;
+        obd->u.obt.obt_stale_export_age = STALE_EXPORT_MAXTIME_DEFAULT;
+        spin_lock_init(&obd->u.obt.obt_trans_table_lock);
+
          filter->fo_fstype = mnt->mnt_sb->s_type->name;
          CDEBUG(D_SUPER, "%s: mnt = %p\n", filter->fo_fstype, mnt);
  
@@ -1749,10 +1921,6 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
          obd->obd_lvfs_ctxt.fs = get_ds();
          obd->obd_lvfs_ctxt.cb_ops = filter_lvfs_ops;
  
-        rc = filter_prep(obd);
-        if (rc)
-                GOTO(err_ops, rc);
-
          filter->fo_destroy_in_progress = 0;
          sema_init(&filter->fo_create_lock, 1);
          spin_lock_init(&filter->fo_translock);
@@ -1760,10 +1928,16 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
          INIT_LIST_HEAD(&filter->fo_export_list);
          sema_init(&filter->fo_alloc_lock, 1);
          init_brw_stats(&filter->fo_filter_stats);
+        filter->fo_read_cache = 1; /* enable read-only cache by default */
+        filter->fo_writethrough_cache = 1; /* enable writethrough cache */
          filter->fo_readcache_max_filesize = FILTER_MAX_CACHE_SIZE;
          filter->fo_fmd_max_num = FILTER_FMD_MAX_NUM_DEFAULT;
          filter->fo_fmd_max_age = FILTER_FMD_MAX_AGE_DEFAULT;
  
+        rc = filter_prep(obd);
+        if (rc)
+                GOTO(err_ops, rc);
+
          sprintf(ns_name, "filter-%s", obd->obd_uuid.uuid);
          obd->obd_namespace = ldlm_namespace_new(obd, ns_name, LDLM_NAMESPACE_SERVER,
                                                  LDLM_NAMESPACE_GREEDY);
@@ -1776,7 +1950,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
          ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
                             "filter_ldlm_cb_client", &obd->obd_ldlm_client);
  
-        rc = llog_cat_initialize(obd, 1, NULL);
+        rc = obd_llog_init(obd, obd, 1, NULL, NULL);
          if (rc) {
                  CERROR("failed to setup llogging subsystems\n");
                  GOTO(err_post, rc);
@@ -1786,6 +1960,17 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
          if (rc)
                  GOTO(err_post, rc);
  
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+        q = bdev_get_queue(mnt->mnt_sb->s_bdev);
+        if (q->max_sectors < q->max_hw_sectors &&
+            q->max_sectors < PTLRPC_MAX_BRW_SIZE >> 9)
+                LCONSOLE_INFO("%s: underlying device %s should be tuned "
+                              "for larger I/O requests: max_sectors = %u "
+                              "could be up to max_hw_sectors=%u\n",
+                              obd->obd_name, mnt->mnt_sb->s_id,
+                              q->max_sectors, q->max_hw_sectors);
+#endif
+
          uuid_ptr = fsfilt_uuid(obd, obd->u.obt.obt_sb);
          if (uuid_ptr != NULL) {
                  class_uuid_unparse(uuid_ptr, &uuid);
@@ -1807,8 +1992,8 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
                                label ?: "", label ? "/" : "", str,
                                obd->obd_recovery_timeout / 60,
                                obd->obd_recovery_timeout % 60,
-                              obd->obd_max_recoverable_clients,
-                              (obd->obd_max_recoverable_clients == 1) ? "":"s",
+                              obd->obd_recoverable_clients,
+                              (obd->obd_recoverable_clients == 1) ? "":"s",
                                obd->obd_name);
          } else {
                  LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
@@ -1863,14 +2048,32 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
                  lprocfs_counter_init(obd->obd_stats, LPROC_FILTER_WRITE_BYTES,
                                       LPROCFS_CNTR_AVGMINMAX,
                                       "write_bytes", "bytes");
+                lprocfs_counter_init(obd->obd_stats, LPROC_FILTER_GET_PAGE,
+                                     LPROCFS_CNTR_AVGMINMAX|LPROCFS_CNTR_STDDEV,
+                                     "get_page", "usec");
+                lprocfs_counter_init(obd->obd_stats, LPROC_FILTER_NO_PAGE,
+                                     LPROCFS_CNTR_AVGMINMAX,
+                                     "get_page failures", "num");
+                lprocfs_counter_init(obd->obd_stats, LPROC_FILTER_CACHE_ACCESS,
+                                     LPROCFS_CNTR_AVGMINMAX,
+                                     "cache_access", "pages");
+                lprocfs_counter_init(obd->obd_stats, LPROC_FILTER_CACHE_HIT,
+                                     LPROCFS_CNTR_AVGMINMAX,
+                                     "cache_hit", "pages");
+                lprocfs_counter_init(obd->obd_stats, LPROC_FILTER_CACHE_MISS,
+                                     LPROCFS_CNTR_AVGMINMAX,
+                                     "cache_miss", "pages");
                  lproc_filter_attach_seqstat(obd);
+#ifdef HAVE_DELAYED_RECOVERY
+                lprocfs_obd_attach_stale_exports(obd);
+#endif
                  obd->obd_proc_exports_entry = proc_mkdir("exports",
                                                           obd->obd_proc_entry);
          }
          if (obd->obd_proc_exports_entry)
                  lprocfs_add_simple(obd->obd_proc_exports_entry, "clear",
                                     lprocfs_nid_stats_clear_read,
-                                   lprocfs_nid_stats_clear_write, obd);
+                                   lprocfs_nid_stats_clear_write, obd, NULL);
  
          memcpy((void *)addr, lustre_cfg_buf(lcfg, 4),
                 LUSTRE_CFG_BUFLEN(lcfg, 4));
@@ -1903,61 +2106,75 @@ static int filter_llog_init(struct obd_device *obd, struct obd_device *tgt,
          int rc;
          ENTRY;
  
-        OBD_ALLOC(filter->fo_lcm, sizeof(struct llog_commit_master));
+        filter->fo_lcm = llog_recov_thread_init(obd->obd_name);
          if (!filter->fo_lcm)
                  RETURN(-ENOMEM);
  
-        rc = llog_init_commit_master((struct llog_commit_master *)
-                                     filter->fo_lcm);
-        if (rc)
-                GOTO(cleanup, rc);
-
          filter_mds_ost_repl_logops = llog_client_ops;
          filter_mds_ost_repl_logops.lop_cancel = llog_obd_repl_cancel;
-        filter_mds_ost_repl_logops.lop_connect = llog_repl_connect;
+        filter_mds_ost_repl_logops.lop_connect = llog_obd_repl_connect;
          filter_mds_ost_repl_logops.lop_sync = llog_obd_repl_sync;
  
          rc = llog_setup(obd, LLOG_MDS_OST_REPL_CTXT, tgt, 0, NULL,
                          &filter_mds_ost_repl_logops);
          if (rc)
-                GOTO(cleanup, rc);
+                GOTO(cleanup_lcm, rc);
  
          /* FIXME - assign unlink_cb for filter's recovery */
          ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
          ctxt->llog_proc_cb = filter_recov_log_mds_ost_cb;
-        ctxt->loc_lcm = obd->u.filter.fo_lcm;
-        rc = llog_start_commit_thread(ctxt->loc_lcm);
+        ctxt->loc_lcm = filter->fo_lcm;
          llog_ctxt_put(ctxt);
-        if (rc)
-                GOTO(cleanup, rc);
  
          rc = llog_setup(obd, LLOG_SIZE_ORIG_CTXT, tgt, 0, NULL,
                          &filter_size_orig_logops);
-
-cleanup:
-        if (rc) {
-                llog_cleanup_commit_master(filter->fo_lcm, 0);
-                OBD_FREE(filter->fo_lcm, sizeof(struct llog_commit_master));
-                filter->fo_lcm = NULL;
-        }
+        if (rc)
+                GOTO(cleanup_ctxt, rc);
          RETURN(rc);
+cleanup_ctxt:
+        ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
+        if (ctxt)
+                llog_cleanup(ctxt);
+cleanup_lcm:
+        llog_recov_thread_fini(filter->fo_lcm, 1);
+        filter->fo_lcm = NULL;
+        return rc;
  }
  
  static int filter_llog_finish(struct obd_device *obd, int count)
  {
+        struct filter_obd *filter = &obd->u.filter;
          struct llog_ctxt *ctxt;
          int rc = 0, rc2 = 0;
          ENTRY;
  
-        if (obd->u.filter.fo_lcm) {
-                llog_cleanup_commit_master((struct llog_commit_master *)
-                                           obd->u.filter.fo_lcm, 1);
-                OBD_FREE(obd->u.filter.fo_lcm, 
-                         sizeof(struct llog_commit_master));
-                obd->u.filter.fo_lcm = NULL;
+        ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
+        if (ctxt) {
+                /*
+                 * Make sure that no cached llcds left in recov_thread. We
+                 * actually do sync in disconnect time, but disconnect may
+                 * not come being marked rq_no_resend = 1.
+                 */
+                llog_sync(ctxt, NULL);
+
+                /*
+                 * Balance class_import_get() called in llog_receptor_accept().
+                 * This is safe to do here, as llog is already synchronized and
+                 * its import may go.
+                 */
+                mutex_down(&ctxt->loc_sem);
+                if (ctxt->loc_imp) {
+                        class_import_put(ctxt->loc_imp);
+                        ctxt->loc_imp = NULL;
+                }
+                mutex_up(&ctxt->loc_sem);
+        }
+
+        if (filter->fo_lcm) {
+                llog_recov_thread_fini(filter->fo_lcm, obd->obd_force);
+                filter->fo_lcm = NULL;
          }
  
-        ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
          if (ctxt)
                  rc = llog_cleanup(ctxt);
  
@@ -2138,7 +2355,8 @@ static int filter_connect_internal(struct obd_export *exp,
  
  static int filter_reconnect(struct obd_export *exp, struct obd_device *obd,
                              struct obd_uuid *cluuid,
-                            struct obd_connect_data *data)
+                            struct obd_connect_data *data,
+                            void *localdata)
  {
          int rc;
          ENTRY;
@@ -2147,6 +2365,8 @@ static int filter_reconnect(struct obd_export *exp, struct obd_device *obd,
                  RETURN(-EINVAL);
  
          rc = filter_connect_internal(exp, data);
+        if (rc == 0)
+                filter_export_stats_init(obd, exp, localdata);
  
          RETURN(rc);
  }
@@ -2334,11 +2554,11 @@ static int filter_destroy_export(struct obd_export *exp)
          lquota_clearinfo(filter_quota_interface_ref, exp, exp->exp_obd);
  
          target_destroy_export(exp);
+        ldlm_destroy_export(exp);
  
          if (obd_uuid_equals(&exp->exp_client_uuid, &exp->exp_obd->obd_uuid))
                  RETURN(0);
  
-        lprocfs_exp_cleanup(exp);
  
          if (exp->exp_obd->obd_replayable)
                  filter_client_free(exp);
@@ -2359,12 +2579,21 @@ static int filter_disconnect(struct obd_export *exp)
  {
          struct obd_device *obd = exp->exp_obd;
          struct llog_ctxt *ctxt;
-        int rc, err;
+        int rc;
          ENTRY;
  
          LASSERT(exp);
          class_export_get(exp);
  
+        /* Flush any remaining cancel messages out to the target */
+        ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
+        if (ctxt) {
+                if (ctxt->loc_imp == exp->exp_imp_reverse)
+                        CDEBUG(D_RPCTRACE, "Reverse import disconnect\n");
+                llog_sync(ctxt, exp);
+                llog_ctxt_put(ctxt);
+        }
+
          if (!(exp->exp_flags & OBD_OPT_FORCE))
                  filter_grant_sanity_check(obd, __FUNCTION__);
          filter_grant_discard(exp);
@@ -2374,14 +2603,7 @@ static int filter_disconnect(struct obd_export *exp)
          if (exp->exp_obd->obd_namespace != NULL)
                  ldlm_cancel_locks_for_export(exp);
  
-        /* flush any remaining cancel messages out to the target */
-        ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
-        err = llog_sync(ctxt, exp);
-        llog_ctxt_put(ctxt);
-
-        if (err)
-                CERROR("error flushing logs to MDS: rc %d\n", err);
-
+        lprocfs_exp_cleanup(exp);
          class_export_put(exp);
          RETURN(rc);
  }
@@ -2390,6 +2612,9 @@ static int filter_ping(struct obd_export *exp)
  {
          filter_fmd_expire(exp);
  
+        if (exp->exp_delayed)
+                filter_update_client_epoch(exp);
+
          return 0;
  }
  
@@ -2491,6 +2716,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
          struct llog_cookie *fcc = NULL;
          struct filter_obd *filter;
          int rc, err, locked = 0, sync = 0;
+        loff_t old_size = 0;
          unsigned int ia_valid;
          struct inode *inode;
          struct iattr iattr;
@@ -2516,9 +2742,15 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
          if (ia_valid & ATTR_SIZE || ia_valid & (ATTR_UID | ATTR_GID)) {
                  DQUOT_INIT(inode);
                  LOCK_INODE_MUTEX(inode);
+                old_size = i_size_read(inode);
                  locked = 1;
          }
  
+        /* VBR: version recovery check */
+        rc = filter_version_get_check(exp, oti, inode);
+        if (rc)
+                GOTO(out_unlock, rc);
+
          /* If the inode still has SUID+SGID bits set (see filter_precreate())
           * then we will accept the UID+GID sent by the client during write for
           * initializing the ownership of this inode.  We only allow this to
@@ -2583,7 +2815,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
          /* The truncate might have used up our transaction credits.  Make
           * sure we have one left for the last_rcvd update. */
          err = fsfilt_extend(exp->exp_obd, inode, 1, handle);
-        rc = filter_finish_transno(exp, oti, rc, sync);
+        rc = filter_finish_transno(exp, inode, oti, rc, sync);
          if (sync) {
                  filter_cancel_cookies_cb(exp->exp_obd, 0, fcc, rc);
                  fcc = NULL;
@@ -2598,14 +2830,17 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
                  fcc = NULL;
          }
  
+        /* For a partial-page truncate flush the page to disk immediately
+         * to avoid data corruption during direct disk write. b=17397 */
+        if (!sync && (iattr.ia_valid & ATTR_SIZE) &&
+            old_size != iattr.ia_size && (iattr.ia_size & ~CFS_PAGE_MASK)) {
+                err = filemap_fdatawrite_range(inode->i_mapping, iattr.ia_size,
+                                               iattr.ia_size + 1);
+                if (!rc)
+                        rc = err;
+        }
+
          if (locked) {
-                /* Let's flush truncated page on disk immediately, then we can
-                 * avoid need to search for page aliases before directio writes
-                 * and this sort of stuff at expense of somewhat slower
-                 * truncates not on a page boundary. I believe this is the only
-                 * place in filter code that can lead to pages getting to
-                 * pagecache so far. */
-                filter_clear_truncated_page(inode);
                  UNLOCK_INODE_MUTEX(inode);
                  locked = 0;
          }
@@ -3004,8 +3239,6 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                  } else
                          next_id = filter_last_id(filter, group) + 1;
  
-                CDEBUG(D_INFO, "precreate objid "LPU64"\n", next_id);
-
                  dparent = filter_parent_lock(obd, group, next_id);
                  if (IS_ERR(dparent))
                          GOTO(cleanup, rc = PTR_ERR(dparent));
@@ -3051,6 +3284,10 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                          GOTO(cleanup, rc = PTR_ERR(handle));
                  cleanup_phase = 3;
  
+                CDEBUG(D_INODE, "%s: filter_precreate(od->o_gr="LPU64
+                       ",od->o_id="LPU64")\n", obd->obd_name, group, 
+                       next_id);
+
                  /* We mark object SUID+SGID to flag it for accepting UID+GID
                   * from client on first write.  Currently the permission bits
                   * on the OST are never used, so this is OK. */
@@ -3157,7 +3394,7 @@ int filter_recreate(struct obd_device *obd, struct obdo *oa)
  static int filter_create(struct obd_export *exp, struct obdo *oa,
                           struct lov_stripe_md **ea, struct obd_trans_info *oti)
  {
-        struct obd_device *obd = NULL;
+        struct obd_device *obd = exp->exp_obd;
          struct lvfs_run_ctxt saved;
          struct lov_stripe_md *lsm = NULL;
          struct ldlm_res_id res_id = { .name = { oa->o_id } };
@@ -3167,6 +3404,9 @@ static int filter_create(struct obd_export *exp, struct obdo *oa,
          int rc = 0;
          ENTRY;
  
+        CDEBUG(D_INODE, "%s: filter_create(od->o_gr="LPU64",od->o_id="
+               LPU64")\n", obd->obd_name, oa->o_gr, oa->o_id);
+
          if (!(oa->o_valid & OBD_MD_FLGROUP))
                  oa->o_gr = 0;
  
@@ -3180,7 +3420,6 @@ static int filter_create(struct obd_export *exp, struct obdo *oa,
                  }
          }
  
-        obd = exp->exp_obd;
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
  
          if ((oa->o_valid & OBD_MD_FLFLAGS) &&
@@ -3238,6 +3477,9 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          cleanup_phase = 1;
  
+        CDEBUG(D_INODE, "%s: filter_destroy(od->o_gr="LPU64",od->o_id="
+               LPU64")\n", obd->obd_name, oa->o_gr, oa->o_id);
+
          dchild = filter_fid2dentry(obd, NULL, oa->o_gr, oa->o_id);
          if (IS_ERR(dchild))
                  GOTO(cleanup, rc = PTR_ERR(dchild));
@@ -3257,7 +3499,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
                  }
                  GOTO(cleanup, rc = -ENOENT);
          }
-        
+
          filter_prepare_destroy(obd, oa->o_id);
  
          /* Our MDC connection is established by the MDS to us */
@@ -3276,6 +3518,12 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
           * (see BUG 4180) -bzzz
           */
          LOCK_INODE_MUTEX(dchild->d_inode);
+
+        /* VBR: version recovery check */
+        rc = filter_version_get_check(exp, oti, dchild->d_inode);
+        if (rc)
+                GOTO(cleanup, rc);
+
          handle = fsfilt_start_log(obd, dchild->d_inode, FSFILT_OP_SETATTR,
                                    NULL, 1);
          if (IS_ERR(handle)) {
@@ -3328,13 +3576,13 @@ cleanup:
                                                       filter_cancel_cookies_cb,
                                                       fcc);
                  /* If add_journal_cb failed, then filter_finish_transno
-                 * will commit the handle and we will do a sync 
-                 * on commit. then we call callback directly to free 
-                 * the fcc. 
+                 * will commit the handle and we will do a sync
+                 * on commit. then we call callback directly to free
+                 * the fcc.
                   */
-                rc = filter_finish_transno(exp, oti, rc, sync);
+                rc = filter_finish_transno(exp, NULL, oti, rc, sync);
                  if (sync) {
-                        filter_cancel_cookies_cb(obd, 0, fcc, rc); 
+                        filter_cancel_cookies_cb(obd, 0, fcc, rc);
                          fcc = NULL;
                  }
                  rc2 = fsfilt_commit(obd, dparent->d_inode, handle, 0);
@@ -3393,8 +3641,9 @@ static int filter_truncate(struct obd_export *exp, struct obd_info *oinfo,
          RETURN(rc);
  }
  
-static int filter_sync(struct obd_export *exp, struct obdo *oa,
-                       struct lov_stripe_md *lsm, obd_off start, obd_off end)
+static int filter_sync(struct obd_export *exp, struct obd_info *oinfo,
+                       obd_off start, obd_off end,
+                       struct ptlrpc_request_set *set)
  {
          struct lvfs_run_ctxt saved;
          struct filter_obd *filter;
@@ -3405,17 +3654,23 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa,
  
          filter = &exp->exp_obd->u.filter;
  
-        /* an objid of zero is taken to mean "sync whole filesystem" */
-        if (!oa || !(oa->o_valid & OBD_MD_FLID)) {
+        /* An objid of zero is taken to mean "sync whole filesystem" */
+        if (!oinfo->oi_oa || !(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
                  rc = fsfilt_sync(exp->exp_obd, filter->fo_obt.obt_sb);
-                /* flush any remaining cancel messages out to the target */
+
+                /* Flush any remaining cancel messages out to the target */
                  ctxt = llog_get_context(exp->exp_obd, LLOG_MDS_OST_REPL_CTXT);
-                llog_sync(ctxt, exp);
-                llog_ctxt_put(ctxt);
+                if (ctxt) {
+                        llog_sync(ctxt, exp);
+                        llog_ctxt_put(ctxt);
+                } else {
+                        CERROR("No LLOG_MDS_OST_REPL_CTXT found in obd %p\n",
+                               exp->exp_obd);
+                }
                  RETURN(rc);
          }
  
-        dentry = filter_oa2dentry(exp->exp_obd, oa);
+        dentry = filter_oa2dentry(exp->exp_obd, oinfo->oi_oa);
          if (IS_ERR(dentry))
                  RETURN(PTR_ERR(dentry));
  
@@ -3437,8 +3692,8 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa,
          }
          UNLOCK_INODE_MUTEX(dentry->d_inode);
  
-        oa->o_valid = OBD_MD_FLID;
-        obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS);
+        oinfo->oi_oa->o_valid = OBD_MD_FLID;
+        obdo_from_inode(oinfo->oi_oa, dentry->d_inode, FILTER_VALID_FLAGS);
  
          pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
  
@@ -3447,7 +3702,8 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa,
  }
  
  static int filter_get_info(struct obd_export *exp, __u32 keylen,
-                           void *key, __u32 *vallen, void *val)
+                           void *key, __u32 *vallen, void *val,
+                           struct lov_stripe_md *lsm)
  {
          struct obd_device *obd;
          ENTRY;
@@ -3492,6 +3748,38 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen,
                  RETURN(0);
          }
  
+        if (KEY_IS(KEY_FIEMAP)) {
+                struct ll_fiemap_info_key *fm_key = key;
+                struct dentry *dentry;
+                struct ll_user_fiemap *fiemap = val;
+                struct lvfs_run_ctxt saved;
+                int rc;
+
+                if (fiemap == NULL) {
+                        *vallen = fiemap_count_to_size(
+                                                fm_key->fiemap.fm_extent_count);
+                        RETURN(0);
+                }
+
+                dentry = __filter_oa2dentry(exp->exp_obd, &fm_key->oa,
+                                            __FUNCTION__, 1);
+                if (IS_ERR(dentry))
+                        RETURN(PTR_ERR(dentry));
+
+                memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap));
+                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                rc = fsfilt_iocontrol(obd, dentry->d_inode, NULL,
+                                      EXT3_IOC_FIEMAP, (long)fiemap);
+                if (rc) {
+                        f_dput(dentry);
+                        RETURN(rc);
+                }
+                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+                f_dput(dentry);
+                RETURN(0);
+        }
+
          CDEBUG(D_IOCTL, "invalid key\n");
          RETURN(-EINVAL);
  }
@@ -3552,9 +3840,8 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
                  void *handle;
                  struct super_block *sb = obd->u.obt.obt_sb;
                  struct inode *inode = sb->s_root->d_inode;
-                BDEVNAME_DECLARE_STORAGE(tmp);
                  LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n",
-                              obd->obd_name, ll_bdevname(sb, tmp));
+                              obd->obd_name, sb->s_id);
  
                  handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL);
                  if (!IS_ERR(handle))
@@ -3588,8 +3875,6 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
                  RETURN(rc);
  */
          }
-
-
          default:
                  RETURN(-EINVAL);
          }
@@ -3641,6 +3926,23 @@ static struct lvfs_callback_ops filter_lvfs_ops = {
          l_fid2dentry:     filter_lvfs_fid2dentry,
  };
  
+static int filter_notify(struct obd_device *obd, struct obd_device *watched,
+                         enum obd_notify_event ev, void *data)
+{
+        ENTRY;
+
+        CDEBUG(D_CONFIG, "notify %s ev=%d\n", watched->obd_name, ev);
+
+        switch (ev) {
+        case OBD_NOTIFY_CONFIG:
+                /* call this only when config is processed and stale_export_age
+                 * value is configured */
+                class_disconnect_expired_exports(obd);
+        default:
+                RETURN(0);
+        }
+}
+
  static struct obd_ops filter_obd_ops = {
          .o_owner          = THIS_MODULE,
          .o_get_info       = filter_get_info,
@@ -3670,6 +3972,8 @@ static struct obd_ops filter_obd_ops = {
          .o_iocontrol      = filter_iocontrol,
          .o_health_check   = filter_health_check,
          .o_process_config = filter_process_config,
+        .o_postrecov      = filter_postrecov,
+        .o_notify         = filter_notify,
  };
  
  quota_interface_t *filter_quota_interface_ref;
@@ -3680,7 +3984,7 @@ static int __init obdfilter_init(void)
          struct lprocfs_static_vars lvars;
          int rc;
  
-        printk(KERN_INFO "Lustre: Filtering OBD driver; info@clusterfs.com\n");
+        printk(KERN_INFO "Lustre: Filtering OBD driver; http://www.lustre.org/\n");
  
          lprocfs_filter_init_vars(&lvars);
  
@@ -3737,7 +4041,7 @@ static void __exit obdfilter_exit(void)
                   sizeof(*obdfilter_created_scratchpad));
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Filtering OBD driver");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h

index 4620d0d..4d3dd30 100644 (file)
--- a/lustre/obdfilter/filter_internal.h
+++ b/lustre/obdfilter/filter_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _FILTER_INTERNAL_H
@@ -47,7 +79,7 @@ struct filter_mod_data {
          int              fmd_refcount;  /* reference counter - list holds 1 */
  };
  
-#ifdef BGL_SUPPORT
+#ifdef HAVE_BGL_SUPPORT
  #define FILTER_FMD_MAX_NUM_DEFAULT 128 /* many active files per client on BGL */
  #else
  #define FILTER_FMD_MAX_NUM_DEFAULT  32
@@ -55,6 +87,12 @@ struct filter_mod_data {
  /* Client cache seconds */
  #define FILTER_FMD_MAX_AGE_DEFAULT ((obd_timeout + 10) * HZ)
  
+#ifndef HAVE_PAGE_CONSTANT
+#define mapping_cap_page_constant_write(mapping) 0
+#define SetPageConstant(page) do {} while (0)
+#define ClearPageConstant(page) do {} while (0)
+#endif
+
  struct filter_mod_data *filter_fmd_find(struct obd_export *exp,
                                          obd_id objid, obd_gr group);
  struct filter_mod_data *filter_fmd_get(struct obd_export *exp,
@@ -65,6 +103,11 @@ void filter_fmd_expire(struct obd_export *exp);
  enum {
          LPROC_FILTER_READ_BYTES = 0,
          LPROC_FILTER_WRITE_BYTES = 1,
+        LPROC_FILTER_GET_PAGE = 2,
+        LPROC_FILTER_NO_PAGE = 3,
+        LPROC_FILTER_CACHE_ACCESS = 4,
+        LPROC_FILTER_CACHE_HIT = 5,
+        LPROC_FILTER_CACHE_MISS = 6,
          LPROC_FILTER_LAST,
  };
  
@@ -84,8 +127,8 @@ struct dentry *__filter_oa2dentry(struct obd_device *obd, struct obdo *oa,
                                    const char *what, int quiet);
  #define filter_oa2dentry(obd, oa) __filter_oa2dentry(obd, oa, __FUNCTION__, 0)
  
-int filter_finish_transno(struct obd_export *, struct obd_trans_info *, int rc,
-                          int force_sync);
+int filter_finish_transno(struct obd_export *, struct inode *,
+                          struct obd_trans_info *, int rc, int force_sync);
  __u64 filter_next_id(struct filter_obd *, struct obdo *);
  __u64 filter_last_id(struct filter_obd *, obd_gr group);
  int filter_update_fidea(struct obd_export *exp, struct inode *inode,
@@ -112,19 +155,20 @@ extern struct ldlm_valblock_ops filter_lvbo;
  
  /* filter_io.c */
  int filter_preprw(int cmd, struct obd_export *, struct obdo *, int objcount,
-                  struct obd_ioobj *, int niocount, struct niobuf_remote *,
-                  struct niobuf_local *, struct obd_trans_info *);
+                  struct obd_ioobj *, struct niobuf_remote *,
+                  int *, struct niobuf_local *, struct obd_trans_info *);
  int filter_commitrw(int cmd, struct obd_export *, struct obdo *, int objcount,
-                    struct obd_ioobj *, int niocount, struct niobuf_local *,
-                    struct obd_trans_info *, int rc);
+                    struct obd_ioobj *, struct niobuf_remote *,  int,
+                    struct niobuf_local *, struct obd_trans_info *, int rc);
  int filter_brw(int cmd, struct obd_export *, struct obd_info *oinfo,
                 obd_count oa_bufs, struct brw_page *pga, struct obd_trans_info *);
-void flip_into_page_cache(struct inode *inode, struct page *new_page);
+void filter_invalidate_cache(struct obd_device *, struct obd_ioobj *,
+                             struct niobuf_remote *, struct inode *);
  
  /* filter_io_*.c */
  struct filter_iobuf;
  int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
-                          struct obd_ioobj *obj, int niocount,
+                          struct obd_ioobj *obj, struct niobuf_remote *, int,
                            struct niobuf_local *res, struct obd_trans_info *oti,
                            int rc);
  obd_size filter_grant_space_left(struct obd_export *exp);
diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c

index cd5f1d1..c902903 100644 (file)
--- a/lustre/obdfilter/filter_io.c
+++ b/lustre/obdfilter/filter_io.c
@@ -1,30 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/fs/obdfilter/filter_io.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdfilter/filter_io.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_FILTER
@@ -42,37 +55,6 @@
  
  int *obdfilter_created_scratchpad;
  
-static int filter_alloc_dio_page(struct obd_device *obd, struct inode *inode,
-                                 struct niobuf_local *lnb)
-{
-        struct page *page;
-
-        LASSERT(lnb->page != NULL);
-
-        page = lnb->page;
-#if 0
-        POISON_PAGE(page, 0xf1);
-        if (lnb->len != CFS_PAGE_SIZE) {
-                memset(kmap(page) + lnb->len, 0, CFS_PAGE_SIZE - lnb->len);
-                kunmap(page);
-        }
-#endif
-        page->index = lnb->offset >> CFS_PAGE_SHIFT;
-
-        RETURN(0);
-}
-
-static void filter_free_dio_pages(int objcount, struct obd_ioobj *obj,
-                           int niocount, struct niobuf_local *res)
-{
-        int i, j;
-
-        for (i = 0; i < objcount; i++, obj++) {
-                for (j = 0 ; j < obj->ioo_bufcnt ; j++, res++)
-                                res->page = NULL;
-        }
-}
-
  /* Grab the dirty and seen grant announcements from the incoming obdo.
   * We will later calculate the clients new grant and return it.
   * Caller must hold osfs lock */
@@ -258,21 +240,117 @@ long filter_grant(struct obd_export *exp, obd_size current_grant,
          return grant;
  }
  
+/*
+ * the routine is used to request pages from pagecache
+ *
+ * use GFP_NOFS not allowing to enter FS as the client can run on this node
+ * and we might end waiting on a page he sent in the request we're serving.
+ *
+ * use NORETRY so that the allocator doesn't go crazy: chance to more lucky
+ * thread have enough memory to complete his request. for our request client
+ * will do resend hopefully -bzzz
+ */
+static struct page * filter_get_page(struct obd_device *obd,
+                                     struct inode *inode,
+                                     obd_off offset)
+{
+        struct page *page;
+
+        page = find_or_create_page(inode->i_mapping, offset >> CFS_PAGE_SHIFT,
+                                   GFP_NOFS | __GFP_NORETRY);
+        if (unlikely(page == NULL))
+                lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_NO_PAGE, 1);
+
+        return page;
+}
+
+/*
+ * the routine initializes array of local_niobuf from remote_niobuf
+ */
+static int filter_map_remote_to_local(int objcount, struct obd_ioobj *obj,
+                                      struct niobuf_remote *nb,
+                                      int *nrpages, struct niobuf_local *res)
+{
+        struct niobuf_remote *rnb;
+        struct niobuf_local *lnb;
+        int i, max;
+        ENTRY;
+
+        /* we don't support multiobject RPC yet
+         * ost_brw_read() and ost_brw_write() check this */
+        LASSERT(objcount == 1);
+
+        max = *nrpages;
+        *nrpages = 0;
+        for (i = 0, rnb = nb, lnb = res; i < obj->ioo_bufcnt; i++, rnb++) {
+                obd_off offset = rnb->offset;
+                unsigned int len = rnb->len;
+
+                while (len > 0) {
+                        int poff = offset & (CFS_PAGE_SIZE - 1);
+                        int plen = CFS_PAGE_SIZE - poff;
+
+                        if (*nrpages >= max) {
+                                CERROR("small array of local bufs: %d\n", max);
+                                RETURN(-EINVAL);
+                        }
+
+                        if (plen > len)
+                                plen = len;
+                        lnb->offset = offset;
+                        lnb->len = plen;
+                        lnb->flags = rnb->flags;
+                        lnb->page = NULL;
+                        lnb->rc = 0;
+                        lnb->lnb_grant_used = 0;
+
+                        LASSERTF(plen <= len, "plen %u, len %u\n", plen, len);
+                        offset += plen;
+                        len -= plen;
+                        lnb++;
+                        (*nrpages)++;
+                }
+        }
+        RETURN(0);
+}
+
+/*
+ * the function is used to free all pages used for request
+ * just to mimic cacheless OSS which don't occupy much memory
+ */
+void filter_invalidate_cache(struct obd_device *obd, struct obd_ioobj *obj,
+                             struct niobuf_remote *nb, struct inode *inode)
+{
+        struct niobuf_remote *rnb;
+        int i;
+
+        LASSERT(inode != NULL);
+
+        for (i = 0, rnb = nb; i < obj->ioo_bufcnt; i++, rnb++) {
+                invalidate_mapping_pages(inode->i_mapping, 
+                                         rnb->offset >> CFS_PAGE_SHIFT,
+                                         (rnb->offset + rnb->len) >>
+                                         CFS_PAGE_SHIFT);
+        }
+        
+}
+
  static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
                                int objcount, struct obd_ioobj *obj,
-                              int niocount, struct niobuf_remote *nb,
-                              struct niobuf_local *res,
+                              struct niobuf_remote *nb,
+                              int *pages, struct niobuf_local *res,
                                struct obd_trans_info *oti)
  {
          struct obd_device *obd = exp->exp_obd;
+        struct timeval start, end;
          struct lvfs_run_ctxt saved;
-        struct niobuf_remote *rnb;
          struct niobuf_local *lnb;
          struct dentry *dentry = NULL;
-        struct inode *inode;
+        struct inode *inode = NULL;
          void *iobuf = NULL;
          int rc = 0, i, tot_bytes = 0;
          unsigned long now = jiffies;
+        long timediff;
          ENTRY;
  
          /* We are currently not supporting multi-obj BRW_READ RPCS at all.
@@ -304,28 +382,29 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
          inode = dentry->d_inode;
  
          obdo_to_inode(inode, oa, OBD_MD_FLATIME);
+
+        rc = filter_map_remote_to_local(objcount, obj, nb, pages, res);
+        if (rc)
+                GOTO(cleanup, rc);
+
          fsfilt_check_slow(obd, now, "preprw_read setup");
  
-        for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt;
-             i++, rnb++, lnb++) {
+        /* find pages for all segments, fill array with them */
+        do_gettimeofday(&start);
+        for (i = 0, lnb = res; i < *pages; i++, lnb++) {
+
                  lnb->dentry = dentry;
-                lnb->offset = rnb->offset;
-                lnb->len    = rnb->len;
-                lnb->flags  = rnb->flags;
-
-                /*
-                 * ost_brw_write()->ost_nio_pages_get() already initialized
-                 * lnb->page to point to the page from the per-thread page
-                 * pool (bug 5137), initialize page.
-                 */
-                LASSERT(lnb->page != NULL);
-
-                if (i_size_read(inode) <= rnb->offset)
+
+                if (i_size_read(inode) <= lnb->offset)
                          /* If there's no more data, abort early.  lnb->rc == 0,
                           * so it's easy to detect later. */
                          break;
-                else
-                        filter_alloc_dio_page(obd, inode, lnb);
+
+                lnb->page = filter_get_page(obd, inode, lnb->offset);
+                if (lnb->page == NULL)
+                        GOTO(cleanup, rc = -ENOMEM);
+
+                lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_CACHE_ACCESS, 1);
  
                  if (i_size_read(inode) < lnb->offset + lnb->len - 1)
                          lnb->rc = i_size_read(inode) - lnb->offset;
@@ -334,8 +413,21 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
  
                  tot_bytes += lnb->rc;
  
+                if (PageUptodate(lnb->page)) {
+                        lprocfs_counter_add(obd->obd_stats,
+                                            LPROC_FILTER_CACHE_HIT, 1);
+                        continue;
+                }
+
+                lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_CACHE_MISS, 1);
                  filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
          }
+        do_gettimeofday(&end);
+        timediff = cfs_timeval_sub(&end, &start, NULL);
+        lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_GET_PAGE, timediff);
+
+        if (OBD_FAIL_CHECK(OBD_FAIL_OST_NOMEM))
+                GOTO(cleanup, rc = -ENOMEM);
  
          fsfilt_check_slow(obd, now, "start_page_read");
  
@@ -352,9 +444,20 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
          EXIT;
  
   cleanup:
-        if (rc != 0) {
-                filter_free_dio_pages(objcount, obj, niocount, res);
+        /* unlock pages to allow access from concurrent OST_READ */
+        for (i = 0, lnb = res; i < *pages; i++, lnb++) {
+                if (lnb->page) {
+                        LASSERT(PageLocked(lnb->page));
+                        unlock_page(lnb->page);
+
+                        if (rc) {
+                                page_cache_release(lnb->page);
+                                lnb->page = NULL;
+                        }
+                }
+        }
  
+        if (rc != 0) {
                  if (dentry != NULL)
                          f_dput(dentry);
          }
@@ -378,9 +481,8 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
   * Caller must hold obd_osfs_lock. */
  static int filter_grant_check(struct obd_export *exp, struct obdo *oa, 
                                int objcount, struct fsfilt_objinfo *fso, 
-                              int niocount, struct niobuf_remote *rnb,
-                              struct niobuf_local *lnb, obd_size *left,
-                              struct inode *inode)
+                              int niocount, struct niobuf_local *lnb,
+                              obd_size *left, struct inode *inode)
  {
          struct filter_export_data *fed = &exp->exp_filter_data;
          int blocksize = exp->exp_obd->u.obt.obt_sb->s_blocksize;
@@ -394,13 +496,13 @@ static int filter_grant_check(struct obd_export *exp, struct obdo *oa,
                          int tmp, bytes;
  
                          /* should match the code in osc_exit_cache */
-                        bytes = rnb[n].len;
-                        bytes += rnb[n].offset & (blocksize - 1);
-                        tmp = (rnb[n].offset + rnb[n].len) & (blocksize - 1);
+                        bytes = lnb[n].len;
+                        bytes += lnb[n].offset & (blocksize - 1);
+                        tmp = (lnb[n].offset + lnb[n].len) & (blocksize - 1);
                          if (tmp)
                                  bytes += blocksize - tmp;
  
-                        if ((rnb[n].flags & OBD_BRW_FROM_GRANT) &&
+                        if ((lnb[n].flags & OBD_BRW_FROM_GRANT) &&
                              (oa->o_valid & OBD_MD_FLGRANT)) {
                                  if (fed->fed_grant < used + bytes) {
                                          CDEBUG(D_CACHE,
@@ -411,7 +513,7 @@ static int filter_grant_check(struct obd_export *exp, struct obdo *oa,
                                                 used, bytes, fed->fed_grant, n);
                                  } else {
                                          used += bytes;
-                                        rnb[n].flags |= OBD_BRW_GRANTED;
+                                        lnb[n].flags |= OBD_BRW_GRANTED;
                                          lnb[n].lnb_grant_used = bytes;
                                          CDEBUG(0, "idx %d used=%lu\n", n, used);
                                          rc = 0;
@@ -421,7 +523,7 @@ static int filter_grant_check(struct obd_export *exp, struct obdo *oa,
                          if (*left > ungranted + bytes) {
                                  /* if enough space, pretend it was granted */
                                  ungranted += bytes;
-                                rnb[n].flags |= OBD_BRW_GRANTED;
+                                lnb[n].flags |= OBD_BRW_GRANTED;
                                  lnb[n].lnb_grant_used = bytes;
                                  CDEBUG(0, "idx %d ungranted=%lu\n",n,ungranted);
                                  rc = 0;
@@ -435,7 +537,7 @@ static int filter_grant_check(struct obd_export *exp, struct obdo *oa,
                           * marked BRW_GRANTED are already mapped and we can
                           * ignore this error. */
                          lnb[n].rc = -ENOSPC;
-                        rnb[n].flags &= ~OBD_BRW_GRANTED;
+                        lnb[n].flags &= ~OBD_BRW_GRANTED;
                          CDEBUG(D_CACHE,"%s: cli %s/%p idx %d no space for %d\n",
                                 exp->exp_obd->obd_name,
                                 exp->exp_client_uuid.uuid, exp, n, bytes);
@@ -496,19 +598,20 @@ static int filter_grant_check(struct obd_export *exp, struct obdo *oa,
   * bug) or ensure we get the page locks in an appropriate order. */
  static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
                                 int objcount, struct obd_ioobj *obj,
-                               int niocount, struct niobuf_remote *nb,
+                               struct niobuf_remote *nb, int *pages,
                                 struct niobuf_local *res,
                                 struct obd_trans_info *oti)
  {
+        struct obd_device *obd = exp->exp_obd;
+        struct timeval start, end;
          struct lvfs_run_ctxt saved;
-        struct niobuf_remote *rnb;
          struct niobuf_local *lnb = res;
          struct fsfilt_objinfo fso;
          struct filter_mod_data *fmd;
          struct dentry *dentry = NULL;
          void *iobuf;
          obd_size left;
-        unsigned long now = jiffies;
+        unsigned long now = jiffies, timediff;
          int rc = 0, i, tot_bytes = 0, cleanup_phase = 0;
          ENTRY;
          LASSERT(objcount == 1);
@@ -532,8 +635,9 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
                  GOTO(cleanup, rc = -ENOENT);
          }
  
-        fso.fso_dentry = dentry;
-        fso.fso_bufcnt = obj->ioo_bufcnt;
+        rc = filter_map_remote_to_local(objcount, obj, nb, pages, res);
+        if (rc)
+                GOTO(cleanup, rc);
  
          fsfilt_check_slow(exp->exp_obd, now, "preprw_write setup");
  
@@ -558,7 +662,10 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
  
          left = filter_grant_space_left(exp);
  
-        rc = filter_grant_check(exp, oa, objcount, &fso, niocount, nb, res,
+        fso.fso_dentry = dentry;
+        fso.fso_bufcnt = *pages;
+
+        rc = filter_grant_check(exp, oa, objcount, &fso, *pages, res,
                                  &left, dentry->d_inode);
  
          /* do not zero out oa->o_valid as it is used in filter_commitrw_write()
@@ -571,31 +678,31 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
  
          if (rc)
                  GOTO(cleanup, rc);
+        cleanup_phase = 4;
+
+        do_gettimeofday(&start);
+        for (i = 0, lnb = res; i < *pages; i++, lnb++) {
  
-        for (i = 0, rnb = nb, lnb = res; i < obj->ioo_bufcnt;
-             i++, lnb++, rnb++) {
                  /* We still set up for ungranted pages so that granted pages
                   * can be written to disk as they were promised, and portals
                   * needs to keep the pages all aligned properly. */
                  lnb->dentry = dentry;
-                lnb->offset = rnb->offset;
-                lnb->len    = rnb->len;
-                lnb->flags  = rnb->flags;
-
-                /*
-                 * ost_brw_write()->ost_nio_pages_get() already initialized
-                 * lnb->page to point to the page from the per-thread page
-                 * pool (bug 5137), initialize page.
-                 */
-                LASSERT(lnb->page != NULL);
-                if (lnb->len != CFS_PAGE_SIZE) {
-                        memset(kmap(lnb->page) + lnb->len,
-                               0, CFS_PAGE_SIZE - lnb->len);
-                        kunmap(lnb->page);
-                }
-                lnb->page->index = lnb->offset >> CFS_PAGE_SHIFT;
  
-                cleanup_phase = 4;
+                lnb->page = filter_get_page(obd, dentry->d_inode, lnb->offset);
+                if (lnb->page == NULL)
+                        GOTO(cleanup, rc = -ENOMEM);
+
+                /* DLM locking protects us from write and truncate competing
+                 * for same region, but truncate can leave dirty page in the
+                 * cache. it's possible the writeout on a such a page is in
+                 * progress when we access it. it's also possible that during
+                 * this writeout we put new (partial) data, but then won't
+                 * be able to proceed in filter_commitrw_write(). thus let's
+                 * just wait for writeout completion, should be rare enough.
+                 * -bzzz */
+                if (obd->u.filter.fo_writethrough_cache)
+                        wait_on_page_writeback(lnb->page);
+                BUG_ON(PageWriteback(lnb->page));
  
                  /* If the filter writes a partial page, then has the file
                   * extended, the client will read in the whole page.  the
@@ -632,7 +739,14 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
                  if (lnb->rc == 0)
                          tot_bytes += lnb->len;
          }
+        do_gettimeofday(&end);
+        timediff = cfs_timeval_sub(&end, &start, NULL);
+        lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_GET_PAGE, timediff);
  
+        if (OBD_FAIL_CHECK(OBD_FAIL_OST_NOMEM))
+                GOTO(cleanup, rc = -ENOMEM);
+
+        /* don't unlock pages to prevent any access */
          rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
                                NULL, NULL, NULL);
  
@@ -647,6 +761,16 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
  cleanup:
          switch(cleanup_phase) {
          case 4:
+                if (rc) {
+                        for (i = 0, lnb = res; i < *pages; i++, lnb++) {
+                                if (lnb->page != NULL) {
+                                        unlock_page(lnb->page);
+                                        page_cache_release(lnb->page);
+                                        lnb->page = NULL;
+                                }
+                        }
+                        filter_grant_commit(exp, *pages, res);
+                }
          case 3:
                  filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti);
          case 2:
@@ -669,47 +793,33 @@ cleanup:
  }
  
  int filter_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
-                  int objcount, struct obd_ioobj *obj, int niocount,
-                  struct niobuf_remote *nb, struct niobuf_local *res,
-                  struct obd_trans_info *oti)
+                  int objcount, struct obd_ioobj *obj,
+                  struct niobuf_remote *nb, int *pages,
+                  struct niobuf_local *res, struct obd_trans_info *oti)
  {
          if (cmd == OBD_BRW_WRITE)
                  return filter_preprw_write(cmd, exp, oa, objcount, obj,
-                                           niocount, nb, res, oti);
+                                           nb, pages, res, oti);
          if (cmd == OBD_BRW_READ)
                  return filter_preprw_read(cmd, exp, oa, objcount, obj,
-                                          niocount, nb, res, oti);
+                                          nb, pages, res, oti);
          LBUG();
          return -EPROTO;
  }
  
-void filter_release_read_page(struct filter_obd *filter, struct inode *inode,
-                              struct page *page)
-{
-        int drop = 0;
-
-        if (inode != NULL &&
-            (i_size_read(inode) > filter->fo_readcache_max_filesize))
-                drop = 1;
-
-        /* drop from cache like truncate_list_pages() */
-        if (drop && !TryLockPage(page)) {
-                if (page->mapping)
-                        ll_truncate_complete_page(page);
-                unlock_page(page);
-        }
-        page_cache_release(page);
-}
-
  static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa,
                                  int objcount, struct obd_ioobj *obj,
-                                int niocount, struct niobuf_local *res,
+                                struct niobuf_remote *rnb,
+                                int pages, struct niobuf_local *res,
                                  struct obd_trans_info *oti, int rc)
  {
+        struct filter_obd *fo = &exp->exp_obd->u.filter;
          struct inode *inode = NULL;
          struct ldlm_res_id res_id = { .name = { obj->ioo_id } };
          struct ldlm_resource *resource = NULL;
          struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+        struct niobuf_local *lnb;
+        int i;
          ENTRY;
  
          /* If oa != NULL then filter_preprw_read updated the inode atime
@@ -727,52 +837,22 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa,
          if (res->dentry != NULL)
                  inode = res->dentry->d_inode;
  
-        filter_free_dio_pages(objcount, obj, niocount, res);
+        for (i = 0, lnb = res; i < pages; i++, lnb++) {
+                if (lnb->page != NULL) {
+                        page_cache_release(lnb->page);
+                        lnb->page = NULL;
+                }
+        }
+
+        if (inode && (fo->fo_read_cache == 0 ||
+                        i_size_read(inode) > fo->fo_readcache_max_filesize))
+                filter_invalidate_cache(exp->exp_obd, obj, rnb, inode);
  
          if (res->dentry != NULL)
                  f_dput(res->dentry);
          RETURN(rc);
  }
  
-void flip_into_page_cache(struct inode *inode, struct page *new_page)
-{
-        struct page *old_page;
-        int rc;
-
-        do {
-                /* the dlm is protecting us from read/write concurrency, so we
-                 * expect this find_lock_page to return quickly.  even if we
-                 * race with another writer it won't be doing much work with
-                 * the page locked.  we do this 'cause t_c_p expects a
-                 * locked page, and it wants to grab the pagecache lock
-                 * as well. */
-                old_page = find_lock_page(inode->i_mapping, new_page->index);
-                if (old_page) {
-                        ll_truncate_complete_page(old_page);
-                        unlock_page(old_page);
-                        page_cache_release(old_page);
-                }
-
-#if 0 /* this should be a /proc tunable someday */
-                /* racing o_directs (no locking ioctl) could race adding
-                 * their pages, so we repeat the page invalidation unless
-                 * we successfully added our new page */
-                rc = add_to_page_cache_unique(new_page, inode->i_mapping,
-                                              new_page->index,
-                                              page_hash(inode->i_mapping,
-                                                        new_page->index));
-                if (rc == 0) {
-                        /* add_to_page_cache clears uptodate|dirty and locks
-                         * the page */
-                        SetPageUptodate(new_page);
-                        unlock_page(new_page);
-                }
-#else
-                rc = 0;
-#endif
-        } while (rc != 0);
-}
-
  void filter_grant_commit(struct obd_export *exp, int niocount,
                           struct niobuf_local *res)
  {
@@ -805,16 +885,17 @@ void filter_grant_commit(struct obd_export *exp, int niocount,
  }
  
  int filter_commitrw(int cmd, struct obd_export *exp, struct obdo *oa,
-                    int objcount, struct obd_ioobj *obj, int niocount,
+                    int objcount, struct obd_ioobj *obj,
+                    struct niobuf_remote *nb, int pages,
                      struct niobuf_local *res, struct obd_trans_info *oti,
                      int rc)
  {
          if (cmd == OBD_BRW_WRITE)
-                return filter_commitrw_write(exp, oa, objcount, obj, niocount,
-                                             res, oti, rc);
+                return filter_commitrw_write(exp, oa, objcount, obj,
+                                             nb, pages, res, oti, rc);
          if (cmd == OBD_BRW_READ)
-                return filter_commitrw_read(exp, oa, objcount, obj, niocount,
-                                            res, oti, rc);
+                return filter_commitrw_read(exp, oa, objcount, obj,
+                                            nb, pages, res, oti, rc);
          LBUG();
          return -EPROTO;
  }
@@ -827,7 +908,7 @@ int filter_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
          struct niobuf_local *lnb;
          struct niobuf_remote *rnb;
          obd_count i;
-        int ret = 0;
+        int ret = 0, npages;
          ENTRY;
  
          OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local));
@@ -845,13 +926,15 @@ int filter_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
          obdo_to_ioobj(oinfo->oi_oa, &ioo);
          ioo.ioo_bufcnt = oa_bufs;
  
+        npages = oa_bufs;
          ret = filter_preprw(cmd, exp, oinfo->oi_oa, 1, &ioo,
-                            oa_bufs, rnb, lnb, oti);
+                            rnb, &npages, lnb, oti);
          if (ret != 0)
                  GOTO(out, ret);
+        LASSERTF(oa_bufs == npages, "%u != %u\n", oa_bufs, npages);
  
-        ret = filter_commitrw(cmd, exp, oinfo->oi_oa, 1, &ioo,
-                              oa_bufs, lnb, oti, ret);
+        ret = filter_commitrw(cmd, exp, oinfo->oi_oa, 1, &ioo, rnb,
+                              npages, lnb, oti, ret);
  
  out:
          if (lnb)
diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c

deleted file mode 100644 (file)

index a2042f8..0000000
--- a/lustre/obdfilter/filter_io_24.c
+++ /dev/null
@@ -1,544 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  linux/fs/obdfilter/filter_io.c
- *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
- *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
- *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
- */
-
-#ifndef AUTOCONF_INCLUDED
-#include <linux/config.h>
-#endif
-#include <linux/module.h>
-#include <linux/pagemap.h> // XXX kill me soon
-#include <linux/version.h>
-
-#define DEBUG_SUBSYSTEM S_FILTER
-
-#include <linux/iobuf.h>
-#include <linux/locks.h>
-
-#include <obd_class.h>
-#include <lustre_fsfilt.h>
-#include "filter_internal.h"
-
-/* Bug 2254 -- this is better done in ext3_map_inode_page, but this
- * workaround will suffice until everyone has upgraded their kernels */
-static void check_pending_bhs(unsigned long *blocks, int nr_pages, dev_t dev,
-                              int size)
-{
-#if (LUSTRE_KERNEL_VERSION < 32)
-        struct buffer_head *bh;
-        int i;
-
-        for (i = 0; i < nr_pages; i++) {
-                bh = get_hash_table(dev, blocks[i], size);
-                if (bh == NULL)
-                        continue;
-                if (!buffer_dirty(bh)) {
-                        put_bh(bh);
-                        continue;
-                }
-                mark_buffer_clean(bh);
-                wait_on_buffer(bh);
-                clear_bit(BH_Req, &bh->b_state);
-                __brelse(bh);
-        }
-#endif
-}
-
-/* when brw_kiovec() is asked to read from block -1UL it just zeros
- * the page.  this gives us a chance to verify the write mappings
- * as well */
-static int filter_cleanup_mappings(int rw, struct kiobuf *iobuf,
-                                   struct inode *inode)
-{
-        int i, blocks_per_page_bits = CFS_PAGE_SHIFT - inode->i_blkbits;
-        ENTRY;
-
-        for (i = 0 ; i < iobuf->nr_pages << blocks_per_page_bits; i++) {
-                if (KIOBUF_GET_BLOCKS(iobuf)[i] > 0)
-                        continue;
-
-                if (rw == OBD_BRW_WRITE)
-                        RETURN(-EINVAL);
-
-                KIOBUF_GET_BLOCKS(iobuf)[i] = -1UL;
-        }
-        RETURN(0);
-}
-
-#if 0
-static void dump_page(int rw, unsigned long block, struct page *page)
-{
-        char *blah = kmap(page);
-        CDEBUG(D_PAGE, "rw %d block %lu: %02x %02x %02x %02x\n", rw, block,
-                       blah[0], blah[1], blah[2], blah[3]);
-        kunmap(page);
-}
-#endif
-
-/* These are our hacks to keep our directio/bh IO coherent with ext3's
- * page cache use.  Most notably ext3 reads file data into the page
- * cache when it is zeroing the tail of partial-block truncates and
- * leaves it there, sometimes generating io from it at later truncates.
- * This removes the partial page and its buffers from the page cache,
- * so it should only ever cause a wait in rare cases, as otherwise we
- * always do full-page IO to the OST.
- *
- * The call to truncate_complete_page() will call journal_flushpage() to
- * free the buffers and drop the page from cache.  The buffers should not
- * be dirty, because we already called fdatasync/fdatawait on them.
- */
-static int filter_sync_inode_data(struct inode *inode)
-{
-        int rc, rc2;
-
-        /* This is nearly generic_osync_inode, without the waiting on the inode
-        rc = generic_osync_inode(inode, inode->i_mapping,
-                                 OSYNC_DATA|OSYNC_METADATA);
-         */
-        rc = filemap_fdatasync(inode->i_mapping);
-        rc2 = fsync_inode_data_buffers(inode);
-        if (rc == 0)
-                rc = rc2;
-        rc2 = filemap_fdatawait(inode->i_mapping);
-        if (rc == 0)
-                rc = rc2;
-
-        return rc;
-}
-
-static int filter_clear_page_cache(struct inode *inode, struct kiobuf *iobuf)
-{
-        struct page *page;
-        int i, rc;
-
-        check_pending_bhs(KIOBUF_GET_BLOCKS(iobuf), iobuf->nr_pages,
-                          inode->i_dev, 1 << inode->i_blkbits);
-
-        rc = filter_sync_inode_data(inode);
-        if (rc != 0)
-                RETURN(rc);
-
-        /* be careful to call this after fsync_inode_data_buffers has waited
-         * for IO to complete before we evict it from the cache */
-        for (i = 0; i < iobuf->nr_pages ; i++) {
-                page = find_lock_page(inode->i_mapping,
-                                      iobuf->maplist[i]->index);
-                if (page == NULL)
-                        continue;
-                if (page->mapping != NULL) {
-                        /* Now that the only source of such pages in truncate
-                         * path flushes these pages to disk and and then
-                         * discards, this is error condition */
-                        CERROR("Data page in page cache during write!\n");
-                        ll_truncate_complete_page(page);
-                }
-
-                unlock_page(page);
-                page_cache_release(page);
-        }
-
-        return 0;
-}
-
-int filter_clear_truncated_page(struct inode *inode)
-{
-        struct page *page;
-        int rc;
-
-        /* Truncate on page boundary, so nothing to flush? */
-        if (!(i_size_read(inode) & ~CFS_PAGE_MASK))
-                return 0;
-
-        rc = filter_sync_inode_data(inode);
-        if (rc != 0)
-                RETURN(rc);
-
-        /* be careful to call this after fsync_inode_data_buffers has waited
-         * for IO to complete before we evict it from the cache */
-        page = find_lock_page(inode->i_mapping,
-                              i_size_read(inode) >> CFS_PAGE_SHIFT);
-        if (page) {
-                if (page->mapping != NULL)
-                        ll_truncate_complete_page(page);
-
-                unlock_page(page);
-                page_cache_release(page);
-        }
-
-        return 0;
-}
-
-/* Must be called with i_sem taken for writes; this will drop it */
-int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *buf,
-                     struct obd_export *exp, struct iattr *attr,
-                     struct obd_trans_info *oti, void **wait_handle)
-{
-        struct obd_device *obd = exp->exp_obd;
-        struct inode *inode = dchild->d_inode;
-        struct kiobuf *iobuf = (void *)buf;
-        int rc, create = (rw == OBD_BRW_WRITE), committed = 0;
-        int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits, cleanup_phase = 0;
-        struct semaphore *sem = NULL;
-        ENTRY;
-
-        LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw);
-
-        if (iobuf->nr_pages == 0)
-                GOTO(cleanup, rc = 0);
-
-        if (iobuf->nr_pages * blocks_per_page > KIO_MAX_SECTORS)
-                GOTO(cleanup, rc = -EINVAL);
-
-        if (iobuf->nr_pages * blocks_per_page >
-            OBDFILTER_CREATED_SCRATCHPAD_ENTRIES)
-                GOTO(cleanup, rc = -EINVAL);
-
-        cleanup_phase = 1;
-
-        rc = lock_kiovec(1, &iobuf, 1);
-        if (rc < 0)
-                GOTO(cleanup, rc);
-        cleanup_phase = 2;
-
-        if (rw == OBD_BRW_WRITE) {
-                create = 1;
-                sem = &obd->u.filter.fo_alloc_lock;
-        }
-        rc = fsfilt_map_inode_pages(obd, inode, iobuf->maplist,
-                                    iobuf->nr_pages, KIOBUF_GET_BLOCKS(iobuf),
-                                    obdfilter_created_scratchpad, create, sem);
-        if (rc)
-                GOTO(cleanup, rc);
-
-        rc = filter_cleanup_mappings(rw, iobuf, inode);
-        if (rc)
-                GOTO(cleanup, rc);
-
-        if (rw == OBD_BRW_WRITE) {
-                if (rc == 0) {
-                        filter_tally(exp, iobuf->maplist, iobuf->nr_pages,
-                                     KIOBUF_GET_BLOCKS(iobuf), blocks_per_page,
-                                     1);
-
-                        if (attr->ia_size > i_size_read(inode))
-                                attr->ia_valid |= ATTR_SIZE;
-                        rc = fsfilt_setattr(obd, dchild,
-                                            oti->oti_handle, attr, 0);
-                        if (rc)
-                                GOTO(cleanup, rc);
-                }
-
-                up(&inode->i_sem);
-                cleanup_phase = 3;
-
-                rc = filter_finish_transno(exp, oti, 0, 0);
-                if (rc)
-                        GOTO(cleanup, rc);
-
-                rc = fsfilt_commit_async(obd,inode,oti->oti_handle,wait_handle);
-                committed = 1;
-                if (rc)
-                        GOTO(cleanup, rc);
-        } else {
-                filter_tally(exp, iobuf->maplist, iobuf->nr_pages,
-                             KIOBUF_GET_BLOCKS(iobuf), blocks_per_page, 0);
-        }
-
-        rc = filter_clear_page_cache(inode, iobuf);
-        if (rc < 0)
-                GOTO(cleanup, rc);
-
-        rc = fsfilt_send_bio(rw, obd, inode, iobuf);
-
-        CDEBUG(D_INFO, "tried to %s %d pages, rc = %d\n",
-               rw & OBD_BRW_WRITE ? "write" : "read", iobuf->nr_pages, rc);
-
-        if (rc > 0)
-                rc = 0;
-
-        EXIT;
-cleanup:
-        if (!committed && (rw == OBD_BRW_WRITE)) {
-                int err = fsfilt_commit_async(obd, inode,
-                                              oti->oti_handle, wait_handle);
-                if (err)
-                        CERROR("can't close transaction: %d\n", err);
-                /*
-                 * this is error path, so we prefer to return
-                 * original error, not this one
-                 */
-        }
-
-        switch(cleanup_phase) {
-        case 3:
-        case 2:
-                unlock_kiovec(1, &iobuf);
-        case 1:
-        case 0:
-                if (cleanup_phase != 3 && rw == OBD_BRW_WRITE)
-                        up(&inode->i_sem);
-                break;
-        default:
-                CERROR("corrupt cleanup_phase (%d)?\n", cleanup_phase);
-                LBUG();
-                break;
-        }
-        return rc;
-}
-
-/* See if there are unallocated parts in given file region */
-int filter_range_is_mapped(struct inode *inode, obd_size offset, int len)
-{
-        int (*fs_bmap)(struct address_space *, long) =
-                inode->i_mapping->a_ops->bmap;
-        int j;
-
-        /* We can't know if the range is mapped already or not */
-        if (fs_bmap == NULL)
-                return 0;
-
-        offset >>= inode->i_blkbits;
-        len >>= inode->i_blkbits;
-
-        for (j = 0; j < len; j++)
-                if (fs_bmap(inode->i_mapping, offset + j) == 0)
-                        return 0;
-
-        return 1;
-}
-
-/* some kernels require alloc_kiovec callers to zero members through the use of
- * map_user_kiobuf and unmap_.. we don't use those, so we have a little helper
- * that makes sure we don't break the rules. */
-static void clear_kiobuf(struct kiobuf *iobuf)
-{
-        int i;
-
-        for (i = 0; i < iobuf->array_len; i++)
-                iobuf->maplist[i] = NULL;
-
-        iobuf->nr_pages = 0;
-        iobuf->offset = 0;
-        iobuf->length = 0;
-}
-
-struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter,
-                                        int rw, int num_pages)
-{
-        struct kiobuf *iobuf;
-        int rc;
-        ENTRY;
-
-        LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw);
-
-        rc = alloc_kiovec(1, &iobuf);
-        if (rc)
-                RETURN(ERR_PTR(rc));
-
-        rc = expand_kiobuf(iobuf, num_pages);
-        if (rc) {
-                free_kiovec(1, &iobuf);
-                RETURN(ERR_PTR(rc));
-        }
-
-#ifdef HAVE_KIOBUF_DOVARY
-        iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */
-#endif
-        clear_kiobuf(iobuf);
-        RETURN((void *)iobuf);
-}
-
-void filter_free_iobuf(struct filter_iobuf *buf)
-{
-        struct kiobuf *iobuf = (void *)buf;
-
-        clear_kiobuf(iobuf);
-        free_kiovec(1, &iobuf);
-}
-
-void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf,
-                      struct obd_trans_info *oti)
-{
-        int thread_id = oti ? oti->oti_thread_id : -1;
-
-        if (unlikely(thread_id < 0)) {
-                filter_free_iobuf(iobuf);
-                return;
-        }
-
-        LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf,
-                 "iobuf mismatch for thread %d: pool %p iobuf %p\n",
-                 thread_id, filter->fo_iobuf_pool[thread_id], iobuf);
-        clear_kiobuf((void *)iobuf);
-}
-
-int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *buf,
-                           struct inode *inode, struct page *page)
-{
-        struct kiobuf *iobuf = (void *)buf;
-
-        iobuf->maplist[iobuf->nr_pages++] = page;
-        iobuf->length += CFS_PAGE_SIZE;
-
-        return 0;
-}
-
-int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
-                          struct obd_ioobj *obj, int niocount,
-                          struct niobuf_local *res, struct obd_trans_info *oti,
-                          int rc)
-{
-        struct obd_device *obd = exp->exp_obd;
-        struct lvfs_run_ctxt saved;
-        struct niobuf_local *lnb;
-        struct fsfilt_objinfo fso;
-        struct iattr iattr = { 0 };
-        void *iobuf = NULL;
-        struct inode *inode = NULL;
-        int i, n, cleanup_phase = 0, err;
-        unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
-        void *wait_handle;
-        ENTRY;
-        LASSERT(oti != NULL);
-        LASSERT(objcount == 1);
-        LASSERT(current->journal_info == NULL);
-
-        if (rc != 0)
-                GOTO(cleanup, rc);
-
-        iobuf = filter_iobuf_get(&obd->u.filter, oti);
-        if (IS_ERR(iobuf))
-                GOTO(cleanup, rc = PTR_ERR(iobuf));
-        cleanup_phase = 1;
-
-        fso.fso_dentry = res->dentry;
-        fso.fso_bufcnt = obj->ioo_bufcnt;
-        inode = res->dentry->d_inode;
-
-        for (i = 0, lnb = res, n = 0; i < obj->ioo_bufcnt; i++, lnb++) {
-                loff_t this_size;
-
-                /* If overwriting an existing block, we don't need a grant */
-                if (!(lnb->flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC &&
-                    filter_range_is_mapped(inode, lnb->offset, lnb->len))
-                        lnb->rc = 0;
-
-                if (lnb->rc) /* ENOSPC, network RPC error */
-                        continue;
-
-                filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
-
-                /* We expect these pages to be in offset order, but we'll
-                 * be forgiving */
-                this_size = lnb->offset + lnb->len;
-                if (this_size > iattr.ia_size)
-                        iattr.ia_size = this_size;
-        }
-
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        cleanup_phase = 2;
-
-        down(&inode->i_sem);
-        oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
-                                           oti);
-        if (IS_ERR(oti->oti_handle)) {
-                up(&inode->i_sem);
-                rc = PTR_ERR(oti->oti_handle);
-                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
-                       "error starting transaction: rc = %d\n", rc);
-                oti->oti_handle = NULL;
-                GOTO(cleanup, rc);
-        }
-
-        fsfilt_check_slow(obd, now, "brw_start");
-
-        i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
-
-        /* If the inode still has SUID+SGID bits set (see filter_precreate())
-         * then we will accept the UID+GID if sent by the client for
-         * initializing the ownership of this inode.  We only allow this to
-         * happen once (so clear these bits) and later only allow setattr. */
-        if (inode->i_mode & S_ISUID)
-                i |= OBD_MD_FLUID;
-        if (inode->i_mode & S_ISGID)
-                i |= OBD_MD_FLGID;
-
-        iattr_from_obdo(&iattr, oa, i);
-        if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
-                CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
-                       (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
-
-                cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
-
-                iattr.ia_valid |= ATTR_MODE;
-                iattr.ia_mode = inode->i_mode;
-                if (iattr.ia_valid & ATTR_UID)
-                        iattr.ia_mode &= ~S_ISUID;
-                if (iattr.ia_valid & ATTR_GID)
-                        iattr.ia_mode &= ~S_ISGID;
-
-                rc = filter_update_fidea(exp, inode, oti->oti_handle, oa);
-        }
-
-        /* filter_direct_io drops i_sem */
-        rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
-                              oti, &wait_handle);
-        if (rc == 0)
-                obdo_from_inode(oa, inode, FILTER_VALID_FLAGS);
-
-        fsfilt_check_slow(obd, now, "direct_io");
-
-        err = fsfilt_commit_wait(obd, inode, wait_handle);
-        if (err) {
-                CERROR("Failure to commit OST transaction (%d)?\n", err);
-                rc = err;
-        }
-        if (obd->obd_replayable && !rc)
-                LASSERTF(oti->oti_transno <= obd->obd_last_committed,
-                         "oti_transno "LPU64" last_committed "LPU64"\n",
-                         oti->oti_transno, obd->obd_last_committed);
-        fsfilt_check_slow(obd, now, "commitrw commit");
-
-cleanup:
-        filter_grant_commit(exp, niocount, res);
-
-        switch (cleanup_phase) {
-        case 2:
-                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-                LASSERT(current->journal_info == NULL);
-        case 1:
-                filter_iobuf_put(&obd->u.filter, iobuf, oti);
-        case 0:
-                /*
-                 * lnb->page automatically returns back into per-thread page
-                 * pool (bug 5137)
-                 */
-                f_dput(res->dentry);
-        }
-
-        RETURN(rc);
-}
diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c

index cf13af1..5ad3dda 100644 (file)
--- a/lustre/obdfilter/filter_io_26.c
+++ b/lustre/obdfilter/filter_io_26.c
@@ -1,30 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/fs/obdfilter/filter_io.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdfilter/filter_io_26.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #ifndef AUTOCONF_INCLUDED
@@ -52,7 +65,6 @@ struct filter_iobuf {
          int               dr_error;
          struct page     **dr_pages;
          unsigned long    *dr_blocks;
-        spinlock_t        dr_lock;              /* IRQ lock */
          unsigned int      dr_ignore_quota:1;
          struct filter_obd *dr_filter;
  };
@@ -87,7 +99,7 @@ static void record_finish_io(struct filter_iobuf *iobuf, int rw, int rc)
  {
          struct filter_obd *filter = iobuf->dr_filter;
  
-        /* CAVEAT EMPTOR: possibly in IRQ context 
+        /* CAVEAT EMPTOR: possibly in IRQ context
           * DO NOT record procfs stats here!!! */
  
          if (rw == OBD_BRW_READ)
@@ -102,28 +114,24 @@ static void record_finish_io(struct filter_iobuf *iobuf, int rw, int rc)
  static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
  {
          struct filter_iobuf *iobuf = bio->bi_private;
-        unsigned long        flags;
-
-#ifdef HAVE_PAGE_CONSTANT
          struct bio_vec *bvl;
          int i;
-#endif
  
-        /* CAVEAT EMPTOR: possibly in IRQ context 
+        /* CAVEAT EMPTOR: possibly in IRQ context
           * DO NOT record procfs stats here!!! */
  
          if (bio->bi_size)                       /* Not complete */
                  return 1;
  
-        if (iobuf == NULL) {
+        if (unlikely(iobuf == NULL)) {
                  CERROR("***** bio->bi_private is NULL!  This should never "
                         "happen.  Normally, I would crash here, but instead I "
                         "will dump the bio contents to the console.  Please "
-                       "report this to CFS, along with any interesting "
-                       "messages leading up to this point (like SCSI errors, "
-                       "perhaps).  Because bi_private is NULL, I can't wake up "
-                       "the thread that initiated this I/O -- so you will "
-                       "probably have to reboot this node.\n");
+                       "report this to <http://bugzilla.lustre.org/> , along "
+                       "with any interesting messages leading up to this point "
+                       "(like SCSI errors, perhaps).  Because bi_private is "
+                       "NULL, I can't wake up the thread that initiated this "
+                       "IO - you will probably have to reboot this node.\n");
                  CERROR("bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d, "
                         "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, "
                         "bi_private: %p\n", bio->bi_next, bio->bi_flags,
@@ -133,18 +141,27 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
                  return 0;
          }
  
-#ifdef HAVE_PAGE_CONSTANT
-        bio_for_each_segment(bvl, bio, i)
-                ClearPageConstant(bvl->bv_page);
-#endif
+        /* the check is outside of the cycle for performance reason -bzzz */
+        if (!test_bit(BIO_RW, &bio->bi_rw)) {
+                bio_for_each_segment(bvl, bio, i) {
+                        if (likely(error == 0))
+                                SetPageUptodate(bvl->bv_page);
+                        LASSERT(PageLocked(bvl->bv_page));
+                        ClearPageConstant(bvl->bv_page);
+                }
+                record_finish_io(iobuf, OBD_BRW_READ, error);
+        } else {
+                if (mapping_cap_page_constant_write(iobuf->dr_pages[0]->mapping)){
+                        bio_for_each_segment(bvl, bio, i) {
+                                ClearPageConstant(bvl->bv_page);
+                        }
+                }
+                record_finish_io(iobuf, OBD_BRW_WRITE, error);
+        }
  
-        spin_lock_irqsave(&iobuf->dr_lock, flags);
-        if (iobuf->dr_error == 0)
+        /* any real error is good enough -bzzz */
+        if (error != 0 && iobuf->dr_error == 0)
                  iobuf->dr_error = error;
-        spin_unlock_irqrestore(&iobuf->dr_lock, flags);
-
-        record_finish_io(iobuf, test_bit(BIO_RW, &bio->bi_rw) ?
-                         OBD_BRW_WRITE : OBD_BRW_READ, error);
  
          /* Completed bios used to be chained off iobuf->dr_bios and freed in
           * filter_clear_dreq().  It was then possible to exhaust the biovec-256
@@ -189,7 +206,6 @@ struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter,
          iobuf->dr_filter = filter;
          init_waitqueue_head(&iobuf->dr_wait);
          atomic_set(&iobuf->dr_numreqs, 0);
-        spin_lock_init(&iobuf->dr_lock);
          iobuf->dr_max_pages = num_pages;
          iobuf->dr_npages = 0;
          iobuf->dr_error = 0;
@@ -228,7 +244,8 @@ void filter_free_iobuf(struct filter_iobuf *iobuf)
  void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf,
                        struct obd_trans_info *oti)
  {
-        int thread_id = oti ? oti->oti_thread_id : -1;
+        int thread_id = (oti && oti->oti_thread) ?
+                        oti->oti_thread->t_id : -1;
  
          if (unlikely(thread_id < 0)) {
                  filter_free_iobuf(iobuf);
@@ -306,17 +323,15 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode,
                                  sector_bits))
                                  nblocks++;
  
-#ifdef HAVE_PAGE_CONSTANT
-                        /* I only set the page to be constant only if it 
-                         * is mapped to a contiguous underlying disk block(s). 
-                         * It will then make sure the corresponding device 
-                         * cache of raid5 will be overwritten by this page. 
+                        /* I only set the page to be constant only if it
+                         * is mapped to a contiguous underlying disk block(s).
+                         * It will then make sure the corresponding device
+                         * cache of raid5 will be overwritten by this page.
                           * - jay */
-                        if ((rw == OBD_BRW_WRITE) && 
-                            (nblocks == blocks_per_page) && 
+                        if ((rw == OBD_BRW_WRITE) &&
+                            (nblocks == blocks_per_page) &&
                              mapping_cap_page_constant_write(inode->i_mapping))
-                                SetPageConstant(page);
-#endif
+                               SetPageConstant(page);
  
                          if (bio != NULL &&
                              can_be_merged(bio, sector) &&
@@ -349,9 +364,10 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode,
                                  frags++;
                          }
  
-                        /* allocate new bio */
-                        bio = bio_alloc(GFP_NOIO,
-                                        (npages - page_idx) * blocks_per_page);
+                        /* allocate new bio, limited by max BIO size, b=9945 */
+                        bio = bio_alloc(GFP_NOIO, max(BIO_MAX_PAGES,
+                                                     (npages - page_idx) *
+                                                     blocks_per_page));
                          if (bio == NULL) {
                                  CERROR("Can't allocate bio %u*%u = %u pages\n",
                                         (npages - page_idx), blocks_per_page,
@@ -421,110 +437,6 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode,
          RETURN(rc);
  }
  
-/* These are our hacks to keep our directio/bh IO coherent with ext3's
- * page cache use.  Most notably ext3 reads file data into the page
- * cache when it is zeroing the tail of partial-block truncates and
- * leaves it there, sometimes generating io from it at later truncates.
- * This removes the partial page and its buffers from the page cache,
- * so it should only ever cause a wait in rare cases, as otherwise we
- * always do full-page IO to the OST.
- *
- * The call to truncate_complete_page() will call journal_invalidatepage()
- * to free the buffers and drop the page from cache.  The buffers should
- * not be dirty, because we already called fdatasync/fdatawait on them.
- */
-static int filter_sync_inode_data(struct inode *inode, int locked)
-{
-        int rc = 0;
-
-        /* This is nearly do_fsync(), without the waiting on the inode */
-        /* XXX: in 2.6.16 (at least) we don't need to hold i_mutex over
-         * filemap_fdatawrite() and filemap_fdatawait(), so we may no longer
-         * need this lock here at all. */
-        if (!locked)
-                LOCK_INODE_MUTEX(inode);
-        if (inode->i_mapping->nrpages) {
-#ifdef PF_SYNCWRITE
-                current->flags |= PF_SYNCWRITE;
-#endif
-                rc = filemap_fdatawrite(inode->i_mapping);
-                if (rc == 0)
-                        rc = filemap_fdatawait(inode->i_mapping);
-#ifdef PF_SYNCWRITE
-                current->flags &= ~PF_SYNCWRITE;
-#endif
-        }
-        if (!locked)
-                UNLOCK_INODE_MUTEX(inode);
-
-        return rc;
-}
-
-/* Clear pages from the mapping before we do direct IO to that offset.
- * Now that the only source of such pages in the truncate path flushes
- * these pages to disk and then discards them, this is error condition.
- * If add back read cache this will happen again.  This could be disabled
- * until that time if we never see the below error. */
-static int filter_clear_page_cache(struct inode *inode,
-                                   struct filter_iobuf *iobuf)
-{
-        struct page *page;
-        int i, rc;
-
-        rc = filter_sync_inode_data(inode, 0);
-        if (rc != 0)
-                RETURN(rc);
-
-        /* be careful to call this after fsync_inode_data_buffers has waited
-         * for IO to complete before we evict it from the cache */
-        for (i = 0; i < iobuf->dr_npages; i++) {
-                page = find_lock_page(inode->i_mapping,
-                                      iobuf->dr_pages[i]->index);
-                if (page == NULL)
-                        continue;
-                if (page->mapping != NULL) {
-                        CERROR("page %lu (%d/%d) in page cache during write!\n",
-                               page->index, i, iobuf->dr_npages);
-                        wait_on_page_writeback(page);
-                        ll_truncate_complete_page(page);
-                }
-
-                unlock_page(page);
-                page_cache_release(page);
-        }
-
-        return 0;
-}
-
-int filter_clear_truncated_page(struct inode *inode)
-{
-        struct page *page;
-        int rc;
-
-        /* Truncate on page boundary, so nothing to flush? */
-        if (!(i_size_read(inode) & ~CFS_PAGE_MASK))
-                return 0;
-
-        rc = filter_sync_inode_data(inode, 1);
-        if (rc != 0)
-                RETURN(rc);
-
-        /* be careful to call this after fsync_inode_data_buffers has waited
-         * for IO to complete before we evict it from the cache */
-        page = find_lock_page(inode->i_mapping,
-                              i_size_read(inode) >> CFS_PAGE_SHIFT);
-        if (page) {
-                if (page->mapping != NULL) {
-                        wait_on_page_writeback(page);
-                        ll_truncate_complete_page(page);
-                }
-                unlock_page(page);
-                page_cache_release(page);
-        }
-
-        return 0;
-}
-
  /* Must be called with i_mutex taken for writes; this will drop it */
  int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
                       struct obd_export *exp, struct iattr *attr,
@@ -573,7 +485,7 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
  
                  UNLOCK_INODE_MUTEX(inode);
  
-                rc2 = filter_finish_transno(exp, oti, 0, 0);
+                rc2 = filter_finish_transno(exp, inode, oti, 0, 0);
                  if (rc2 != 0) {
                          CERROR("can't close transaction: %d\n", rc2);
                          if (rc == 0)
@@ -590,10 +502,6 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
                               iobuf->dr_blocks, blocks_per_page, 0);
          }
  
-        rc = filter_clear_page_cache(inode, iobuf);
-        if (rc != 0)
-                RETURN(rc);
-
          RETURN(filter_do_bio(exp, inode, iobuf, rw));
  }
  
@@ -618,8 +526,20 @@ static int filter_range_is_mapped(struct inode *inode, obd_size offset, int len)
          return 1;
  }
  
+/*
+ * interesting use cases on how it interacts with VM:
+ *
+ * - vm writeout -- shouldn't see our pages as we don't mark them dirty
+ *   though vm can find partial page left dirty by truncate. in this
+ *   usual writeout is used unless our write rewrite that page - then we
+ *   drop PG_dirty with PG_lock held.
+ *
+ * - else?
+ *
+ */
  int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
-                          int objcount, struct obd_ioobj *obj, int niocount,
+                          int objcount, struct obd_ioobj *obj,
+                          struct niobuf_remote *nb, int niocount,
                            struct niobuf_local *res, struct obd_trans_info *oti,
                            int rc)
  {
@@ -628,10 +548,11 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
          struct lvfs_run_ctxt saved;
          struct fsfilt_objinfo fso;
          struct iattr iattr = { 0 };
-        struct inode *inode = NULL;
+        struct inode *inode = res->dentry->d_inode;
          unsigned long now = jiffies;
          int i, err, cleanup_phase = 0;
          struct obd_device *obd = exp->exp_obd;
+        struct filter_obd *fo = &obd->u.filter;
          void *wait_handle;
          int total_size = 0;
          int rec_pending = 0;
@@ -647,8 +568,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
  
          /* we try to get enough quota to write here, and let ldiskfs
           * decide if it is out of quota or not b=14783 */
-        lquota_chkquota(filter_quota_interface_ref, obd, oa->o_uid,
-                        oa->o_gid, niocount, &rec_pending);
+        lquota_chkquota(filter_quota_interface_ref, obd, oa->o_uid, oa->o_gid,
+                        niocount, &rec_pending, oti, inode, obj->ioo_bufcnt);
  
          iobuf = filter_iobuf_get(&obd->u.filter, oti);
          if (IS_ERR(iobuf))
@@ -657,10 +578,9 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
  
          fso.fso_dentry = res->dentry;
          fso.fso_bufcnt = obj->ioo_bufcnt;
-        inode = res->dentry->d_inode;
  
          iobuf->dr_ignore_quota = 0;
-        for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+        for (i = 0, lnb = res; i < niocount; i++, lnb++) {
                  loff_t this_size;
  
                  /* If overwriting an existing block, we don't need a grant */
@@ -673,6 +593,16 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                          continue;
                  }
  
+                LASSERT(PageLocked(lnb->page));
+                LASSERT(!PageWriteback(lnb->page));
+
+                /* preceding filemap_write_and_wait() should have clean pages */
+                if (fo->fo_writethrough_cache)
+                        clear_page_dirty_for_io(lnb->page);
+                LASSERT(!PageDirty(lnb->page));
+
+                SetPageUptodate(lnb->page);
+
                  err = filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
                  LASSERT (err == 0);
  
@@ -684,10 +614,12 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                  if (this_size > iattr.ia_size)
                          iattr.ia_size = this_size;
  
-                /* if one page is a write-back page from client cache, or it's
-                 * written by root, then mark the whole io request as ignore
-                 * quota request */
-                if (lnb->flags & (OBD_BRW_FROM_GRANT | OBD_BRW_NOQUOTA))
+                /* if one page is a write-back page from client cache and
+                 * not from direct_io, or it's written by root, then mark
+                 * the whole io request as ignore quota request */
+                if (lnb->flags & OBD_BRW_NOQUOTA ||
+                    (lnb->flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
+                    OBD_BRW_FROM_GRANT)
                          iobuf->dr_ignore_quota = 1;
          }
  
@@ -730,7 +662,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                  CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
                         (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
  
-                cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+                cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
  
                  iattr.ia_valid |= ATTR_MODE;
                  iattr.ia_mode = inode->i_mode;
@@ -745,8 +677,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                   * in the inode before filter_direct_io() - see bug 10357. */
                  save = iattr.ia_valid;
                  iattr.ia_valid &= (ATTR_UID | ATTR_GID);
-                rc = fsfilt_setattr(obd, res->dentry, oti->oti_handle, &iattr, 0);
-                CDEBUG(D_QUOTA, "set uid(%u)/gid(%u) to ino(%lu). rc(%d)\n", 
+                rc = fsfilt_setattr(obd, res->dentry, oti->oti_handle,&iattr,0);
+                CDEBUG(D_QUOTA, "set uid(%u)/gid(%u) to ino(%lu). rc(%d)\n",
                                  iattr.ia_uid, iattr.ia_gid, inode->i_ino, rc);
                  iattr.ia_valid = save & ~(ATTR_UID | ATTR_GID);
          }
@@ -779,8 +711,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
  
  cleanup:
          if (rec_pending)
-                lquota_pending_commit(filter_quota_interface_ref, obd, oa->o_uid,
-                                      oa->o_gid, niocount);
+                lquota_pending_commit(filter_quota_interface_ref, obd,
+                                      oa->o_uid, oa->o_gid, rec_pending);
  
          filter_grant_commit(exp, niocount, res);
  
@@ -806,5 +738,20 @@ cleanup:
          CDEBUG(err ? D_ERROR : D_QUOTA,
                 "filter adjust qunit! (rc:%d)\n", err);
  
+        for (i = 0, lnb = res; i < niocount; i++, lnb++) {
+                if (lnb->page == NULL)
+                        continue;
+
+                LASSERT(PageLocked(lnb->page));
+                unlock_page(lnb->page);
+
+                page_cache_release(lnb->page);
+                lnb->page = NULL;
+        }
+
+        if (inode && (fo->fo_writethrough_cache == 0 ||
+                        i_size_read(inode) > fo->fo_readcache_max_filesize))
+                filter_invalidate_cache(obd, obj, nb, inode);
+
          RETURN(rc);
  }
diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c

index dd40fde..261852c 100644 (file)
--- a/lustre/obdfilter/filter_log.c
+++ b/lustre/obdfilter/filter_log.c
@@ -1,30 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/fs/obdfilter/filter_log.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdfilter/filter_log.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_FILTER
@@ -37,9 +50,8 @@
  
  #include <libcfs/list.h>
  #include <obd_class.h>
+#include <lustre_log.h>
  #include <lustre_fsfilt.h>
-#include <lustre_commit_confd.h>
-
  #include "filter_internal.h"
  
  int filter_log_sz_change(struct llog_handle *cathandle,
@@ -132,11 +144,11 @@ static int filter_recov_log_unlink_cb(struct llog_ctxt *ctxt,
                                        struct llog_rec_hdr *rec,
                                        struct llog_cookie *cookie)
  {
-        struct obd_device *obd = ctxt->loc_obd;
-        struct obd_export *exp = obd->obd_self_export;
+        struct obd_export *exp = ctxt->loc_obd->obd_self_export;
          struct llog_unlink_rec *lur;
          struct obdo *oa;
          obd_id oid;
+        obd_count count;
          int rc = 0;
          ENTRY;
  
@@ -146,20 +158,26 @@ static int filter_recov_log_unlink_cb(struct llog_ctxt *ctxt,
                  RETURN(-ENOMEM);
          oa->o_valid |= OBD_MD_FLCOOKIE;
          oa->o_id = lur->lur_oid;
-        oa->o_gr = lur->lur_ogen;
+        oa->o_gr = lur->lur_ogr;
          oa->o_lcookie = *cookie;
          oid = oa->o_id;
-
-        rc = filter_destroy(exp, oa, NULL, NULL, NULL);
-        OBDO_FREE(oa);
-        if (rc == -ENOENT) {
-                CDEBUG(D_RPCTRACE, "object already removed, send cookie\n");
-                llog_cancel(ctxt, NULL, 1, cookie, 0);
-                RETURN(0);
+        /* objid gap may require to destroy several objects in row */
+        count = lur->lur_count + 1;
+
+        while (count > 0) {
+                rc = filter_destroy(exp, oa, NULL, NULL, NULL);
+                if (rc == 0)
+                        CDEBUG(D_RPCTRACE, "object "LPU64" is destroyed\n",
+                               oid);
+                else if (rc != -ENOENT)
+                        CEMERG("error destroying object "LPU64": %d\n",
+                               oid, rc);
+                else
+                        rc = 0;
+                count--;
+                oid++;
          }
-
-        if (rc == 0)
-                CDEBUG(D_RPCTRACE, "object "LPU64" is destroyed\n", oid);
+        OBDO_FREE(oa);
  
          RETURN(rc);
  }
@@ -172,21 +190,31 @@ static int filter_recov_log_setattr_cb(struct llog_ctxt *ctxt,
  {
          struct obd_device *obd = ctxt->loc_obd;
          struct obd_export *exp = obd->obd_self_export;
-        struct llog_setattr_rec *lsr;
          struct obd_info oinfo = { { { 0 } } };
          obd_id oid;
          int rc = 0;
          ENTRY;
  
-        lsr = (struct llog_setattr_rec *)rec;
          OBDO_ALLOC(oinfo.oi_oa);
  
+        if (rec->lrh_type == MDS_SETATTR_REC) {
+                struct llog_setattr_rec *lsr = (struct llog_setattr_rec *)rec;
+
+                oinfo.oi_oa->o_id = lsr->lsr_oid;
+                oinfo.oi_oa->o_gr = lsr->lsr_ogr;
+                oinfo.oi_oa->o_uid = lsr->lsr_uid;
+                oinfo.oi_oa->o_gid = lsr->lsr_gid;
+        } else {
+                struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec;
+
+                oinfo.oi_oa->o_id = lsr->lsr_oid;
+                oinfo.oi_oa->o_gr = lsr->lsr_ogr;
+                oinfo.oi_oa->o_uid = lsr->lsr_uid;
+                oinfo.oi_oa->o_gid = lsr->lsr_gid;
+        }
+
          oinfo.oi_oa->o_valid |= (OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID |
                                   OBD_MD_FLCOOKIE);
-        oinfo.oi_oa->o_id = lsr->lsr_oid;
-        oinfo.oi_oa->o_gr = lsr->lsr_ogen;
-        oinfo.oi_oa->o_uid = lsr->lsr_uid;
-        oinfo.oi_oa->o_gid = lsr->lsr_gid;
          oinfo.oi_oa->o_lcookie = *cookie;
          oid = oinfo.oi_oa->o_id;
  
@@ -206,7 +234,7 @@ static int filter_recov_log_setattr_cb(struct llog_ctxt *ctxt,
  }
  
  int filter_recov_log_mds_ost_cb(struct llog_handle *llh,
-                               struct llog_rec_hdr *rec, void *data)
+                                struct llog_rec_hdr *rec, void *data)
  {
          struct llog_ctxt *ctxt = llh->lgh_ctxt;
          struct llog_cookie cookie;
@@ -231,6 +259,7 @@ int filter_recov_log_mds_ost_cb(struct llog_handle *llh,
                  rc = filter_recov_log_unlink_cb(ctxt, rec, &cookie);
                  break;
          case MDS_SETATTR_REC:
+        case MDS_SETATTR64_REC:
                  rc = filter_recov_log_setattr_cb(ctxt, rec, &cookie);
                  break;
          case LLOG_GEN_REC: {
diff --git a/lustre/obdfilter/filter_lvb.c b/lustre/obdfilter/filter_lvb.c

index 678119b..06c718c 100644 (file)
--- a/lustre/obdfilter/filter_lvb.c
+++ b/lustre/obdfilter/filter_lvb.c
@@ -1,30 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  linux/fs/obdfilter/filter_log.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdfilter/filter_lvb.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_FILTER
@@ -53,11 +66,6 @@ static int filter_lvbo_init(struct ldlm_resource *res)
          LASSERT(res);
          LASSERT_SEM_LOCKED(&res->lr_lvb_sem);
  
-        /* we only want lvb's for object resources */
-        /* check for internal locks: these have name[1] != 0 */
-        if (res->lr_name.name[1])
-                RETURN(0);
-
          if (res->lr_lvb_data)
                  RETURN(0);
  
@@ -71,6 +79,10 @@ static int filter_lvbo_init(struct ldlm_resource *res)
          obd = res->lr_namespace->ns_lvbp;
          LASSERT(obd != NULL);
  
+        CDEBUG(D_INODE, "%s: filter_lvbo_init(o_gr="LPU64", o_id="
+               LPU64")\n", obd->obd_name, res->lr_name.name[1],
+               res->lr_name.name[0]);
+
          dentry = filter_fid2dentry(obd, NULL, 0, res->lr_name.name[0]);
          if (IS_ERR(dentry)) {
                  rc = PTR_ERR(dentry);
@@ -118,11 +130,6 @@ static int filter_lvbo_update(struct ldlm_resource *res, struct ptlrpc_request *
  
          LASSERT(res);
  
-        /* we only want lvb's for object resources */
-        /* check for internal locks: these have name[1] != 0 */
-        if (res->lr_name.name[1])
-                RETURN(0);
-
          down(&res->lr_lvb_sem);
          lvb = res->lr_lvb_data;
          if (lvb == NULL) {
diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c

index 7b6aafa..efd5f67 100644 (file)
--- a/lustre/obdfilter/lproc_obdfilter.c
+++ b/lustre/obdfilter/lproc_obdfilter.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
@@ -176,6 +187,56 @@ int lprocfs_filter_wr_fmd_max_age(struct file *file, const char *buffer,
          return count;
  }
  
+static int lprocfs_filter_rd_cache(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%u\n", obd->u.filter.fo_read_cache);
+}
+
+static int lprocfs_filter_wr_cache(struct file *file, const char *buffer,
+                     unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+
+        if (rc)
+                return rc;
+
+        obd->u.filter.fo_read_cache = val;
+        return count;
+}
+
+static int lprocfs_filter_rd_wcache(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%u\n", obd->u.filter.fo_writethrough_cache);
+}
+
+static int lprocfs_filter_wr_wcache(struct file *file, const char *buffer,
+                     unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+
+        if (rc)
+                return rc;
+
+        obd->u.filter.fo_writethrough_cache = val;
+        return count;
+}
+
  static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
          { "uuid",         lprocfs_rd_uuid,          0, 0 },
          { "blocksize",    lprocfs_rd_blksize,       0, 0 },
@@ -192,6 +253,7 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
          { "tot_pending",  lprocfs_filter_rd_tot_pending, 0, 0 },
          { "tot_granted",  lprocfs_filter_rd_tot_granted, 0, 0 },
          { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 },
+        { "hash_stats",   lprocfs_obd_rd_hash,      0, 0 },
  #ifdef CRAY_XT3
          { "recovery_maxtime", lprocfs_obd_rd_recovery_maxtime,
                                lprocfs_obd_wr_recovery_maxtime, 0},
@@ -214,13 +276,20 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
          { "quota_type",     lprocfs_quota_rd_type,
                              lprocfs_quota_wr_type, 0},
          { "quota_switch_seconds",  lprocfs_quota_rd_switch_seconds,
-                            lprocfs_quota_wr_switch_seconds, 0 },
-
+                                   lprocfs_quota_wr_switch_seconds, 0 },
  #endif
          { "client_cache_count", lprocfs_filter_rd_fmd_max_num,
                            lprocfs_filter_wr_fmd_max_num, 0 },
          { "client_cache_seconds", lprocfs_filter_rd_fmd_max_age,
                            lprocfs_filter_wr_fmd_max_age, 0 },
+        { "read_cache_enable", lprocfs_filter_rd_cache, lprocfs_filter_wr_cache, 0},
+        { "writethrough_cache_enable", lprocfs_filter_rd_wcache,
+                          lprocfs_filter_wr_wcache, 0},
+#ifdef HAVE_DELAYED_RECOVERY
+        { "stale_export_age", lprocfs_obd_rd_stale_export_age,
+                              lprocfs_obd_wr_stale_export_age, 0},
+        { "flush_stale_exports", 0, lprocfs_obd_wr_flush_stale_exports, 0 },
+#endif
          { 0 }
  };
  
@@ -248,8 +317,8 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages,
          lprocfs_oh_tally_log2(&fed->fed_brw_stats.hist[BRW_R_PAGES + wr],
                                nr_pages);
          if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats)
-                lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_PAGES + wr],
-                                      nr_pages);
+                lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->
+                                        hist[BRW_R_PAGES + wr], nr_pages);
  
          while (nr_pages-- > 0) {
                  if (last_page && (*pages)->index != (last_page->index + 1))
@@ -273,9 +342,11 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages,
                           discont_blocks);
  
          if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
-                lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_DISCONT_PAGES + wr],
+                lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->
+                                        hist[BRW_R_DISCONT_PAGES + wr],
                                        discont_pages);
-                lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_DISCONT_BLOCKS + wr],
+                lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->
+                                        hist[BRW_R_DISCONT_BLOCKS + wr],
                                        discont_blocks);
          }
  }
@@ -289,7 +360,7 @@ static void display_brw_stats(struct seq_file *seq, char *name, char *units,
          int i;
  
          seq_printf(seq, "\n%26s read      |     write\n", " ");
-        seq_printf(seq, "%-22s %-5s %% cum %% |  %-5s %% cum %%\n", 
+        seq_printf(seq, "%-22s %-5s %% cum %% |  %-5s %% cum %%\n",
                     name, units, units);
  
          read_tot = lprocfs_oh_sum(read);
@@ -302,7 +373,7 @@ static void display_brw_stats(struct seq_file *seq, char *name, char *units,
                  if (read_cum == 0 && write_cum == 0)
                          continue;
  
-                if (!log2) 
+                if (!log2)
                          seq_printf(seq, "%u", i);
                  else if (i < 10)
                          seq_printf(seq, "%u", 1<<i);
@@ -312,7 +383,7 @@ static void display_brw_stats(struct seq_file *seq, char *name, char *units,
                          seq_printf(seq, "%uM", 1<<(i-20));
  
                  seq_printf(seq, ":\t\t%10lu %3lu %3lu   | %4lu %3lu %3lu\n",
-                           r, pct(r, read_tot), pct(read_cum, read_tot), 
+                           r, pct(r, read_tot), pct(read_cum, read_tot),
                             w, pct(w, write_tot), pct(write_cum, write_tot));
  
                  if (read_cum == read_tot && write_cum == write_tot)
@@ -345,7 +416,6 @@ static void brw_stats_show(struct seq_file *seq, struct brw_stats *brw_stats)
                            &brw_stats->hist[BRW_R_DIO_FRAGS],
                            &brw_stats->hist[BRW_W_DIO_FRAGS], 0);
  
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
          display_brw_stats(seq, "disk I/Os in flight", "ios",
                            &brw_stats->hist[BRW_R_RPC_HIST],
                            &brw_stats->hist[BRW_W_RPC_HIST], 0);
@@ -361,7 +431,6 @@ static void brw_stats_show(struct seq_file *seq, struct brw_stats *brw_stats)
          display_brw_stats(seq, "disk I/O size", "ios",
                            &brw_stats->hist[BRW_R_DISK_IOSIZE],
                            &brw_stats->hist[BRW_W_DISK_IOSIZE], 1);
-#endif
  }
  
  #undef pct
diff --git a/lustre/osc/autoMakefile.am b/lustre/osc/autoMakefile.am

index 985e473..65c588b 100644 (file)
--- a/lustre/osc/autoMakefile.am
+++ b/lustre/osc/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if LIBLUSTRE
  noinst_LIBRARIES = libosc.a
diff --git a/lustre/osc/cache.c b/lustre/osc/cache.c

index 4f4ddd9..371b78e 100644 (file)
--- a/lustre/osc/cache.c
+++ b/lustre/osc/cache.c
@@ -1,26 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author Oleg Drokin <green@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/osc/cache.c
   *
   * Cache of triples - object, lock, extent
   */
@@ -335,6 +348,9 @@ static int cache_remove_extents_from_lock(struct lustre_cache *cache,
                             cache_extent_removal_event */
                          ext_data = extent->oap_page;
                          cache->lc_pin_extent_cb(extent->oap_page);
+
+                        if (lock->l_flags & LDLM_FL_BL_AST)
+                                extent->oap_async_flags |= ASYNC_HP;
                          spin_unlock(&extent->oap_lock);
                          spin_unlock(&lock->l_extents_list_lock);
                          cache_extent_removal_event(cache, ext_data,
diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c

index b5c3929..15c6860 100644 (file)
--- a/lustre/osc/lproc_osc.c
+++ b/lustre/osc/lproc_osc.c
@@ -1,33 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
  #include <linux/version.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #include <asm/statfs.h>
-#endif
  #include <obd_class.h>
  #include <lprocfs_status.h>
  #include <linux/seq_file.h>
@@ -51,7 +60,7 @@ static int osc_wr_active(struct file *file, const char *buffer,
  {
          struct obd_device *dev = data;
          int val, rc;
-        
+
          rc = lprocfs_write_helper(buffer, count, &val);
          if (rc)
                  return rc;
@@ -60,11 +69,11 @@ static int osc_wr_active(struct file *file, const char *buffer,
  
          LPROCFS_CLIMP_CHECK(dev);
          /* opposite senses */
-        if (dev->u.cli.cl_import->imp_deactive == val) 
+        if (dev->u.cli.cl_import->imp_deactive == val)
                  rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val);
          else
                  CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", val);
-        
+
          LPROCFS_CLIMP_EXIT(dev);
          return count;
  }
@@ -230,6 +239,53 @@ static int osc_wr_create_count(struct file *file, const char *buffer,
                                 unsigned long count, void *data)
  {
          struct obd_device *obd = data;
+        int val, rc, i;
+
+        if (obd == NULL)
+                return 0;
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        /* The MDT ALWAYS needs to limit the precreate count to
+         * OST_MAX_PRECREATE, and the constant cannot be changed
+         * because it is a value shared between the OSC and OST
+         * that is the maximum possible number of objects that will
+         * ever be handled by MDT->OST recovery processing.
+         *
+         * If the OST ever gets a request to delete more orphans,
+         * this implies that something has gone badly on the MDT
+         * and the OST will refuse to delete so much data from the
+         * filesystem as a safety measure. */
+        if (val < OST_MIN_PRECREATE || val > OST_MAX_PRECREATE)
+                return -ERANGE;
+        if (val > obd->u.cli.cl_oscc.oscc_max_grow_count)
+                return -ERANGE;
+
+        for (i = 1; (i << 1) <= val; i <<= 1)
+                ;
+        obd->u.cli.cl_oscc.oscc_grow_count = i;
+
+        return count;
+}
+
+static int osc_rd_max_create_count(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data)
+{
+        struct obd_device *obd = data;
+
+        if (obd == NULL)
+                return 0;
+
+        return snprintf(page, count, "%d\n",
+                        obd->u.cli.cl_oscc.oscc_max_grow_count);
+}
+
+static int osc_wr_max_create_count(struct file *file, const char *buffer,
+                                   unsigned long count, void *data)
+{
+        struct obd_device *obd = data;
          int val, rc;
  
          if (obd == NULL)
@@ -244,7 +300,10 @@ static int osc_wr_create_count(struct file *file, const char *buffer,
          if (val > OST_MAX_PRECREATE)
                  return -ERANGE;
  
-        obd->u.cli.cl_oscc.oscc_grow_count = val;
+        if (obd->u.cli.cl_oscc.oscc_grow_count > val)
+                obd->u.cli.cl_oscc.oscc_grow_count = val;
+
+        obd->u.cli.cl_oscc.oscc_max_grow_count = val;
  
          return count;
  }
@@ -364,7 +423,7 @@ static int osc_rd_resend_count(char *page, char **start, off_t off, int count,
  {
          struct obd_device *obd = data;
  
-        return snprintf(page, count, "%u\n", atomic_read(&obd->u.cli.cl_resends)); 
+        return snprintf(page, count, "%u\n", atomic_read(&obd->u.cli.cl_resends));
  }
  
  static int osc_wr_resend_count(struct file *file, const char *buffer,
@@ -387,7 +446,7 @@ static int osc_wr_resend_count(struct file *file, const char *buffer,
  
  static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          { "uuid",            lprocfs_rd_uuid,        0, 0 },
-        { "ping",            0, lprocfs_wr_ping,        0 },
+        { "ping",            0, lprocfs_wr_ping,     0, 0, 0222 },
          { "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
          { "blocksize",       lprocfs_rd_blksize,     0, 0 },
          { "kbytestotal",     lprocfs_rd_kbytestotal, 0, 0 },
@@ -398,7 +457,7 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          //{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },
          { "ost_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
          { "ost_conn_uuid",   lprocfs_rd_conn_uuid, 0, 0 },
-        { "active",          osc_rd_active, 
+        { "active",          osc_rd_active,
                               osc_wr_active, 0 },
          { "max_pages_per_rpc", osc_rd_max_pages_per_rpc,
                                 osc_wr_max_pages_per_rpc, 0 },
@@ -408,12 +467,15 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          { "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 },
          { "cur_grant_bytes", osc_rd_cur_grant_bytes, 0, 0 },
          { "create_count",    osc_rd_create_count, osc_wr_create_count, 0 },
+        { "max_create_count", osc_rd_max_create_count,
+                              osc_wr_max_create_count, 0},
          { "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 },
          { "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 },
          { "checksums",       osc_rd_checksum, osc_wr_checksum, 0 },
          { "checksum_type",   osc_rd_checksum_type, osc_wd_checksum_type, 0 },
          { "resend_count",  osc_rd_resend_count, osc_wr_resend_count, 0},
          { "timeouts",        lprocfs_rd_timeouts,      0, 0 },
+        { "import",          lprocfs_rd_import,    0, 0 },
          { 0 }
  };
  
@@ -509,7 +571,7 @@ static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v)
                  write_cum += w;
                  seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
                             (i == 0) ? 0 : 1 << (i - 1),
-                           r, pct(r, read_tot), pct(read_cum, read_tot), 
+                           r, pct(r, read_tot), pct(read_cum, read_tot),
                             w, pct(w, write_tot), pct(write_cum, write_tot));
                  if (read_cum == read_tot && write_cum == write_tot)
                          break;
@@ -552,4 +614,3 @@ void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
          lvars->obd_vars    = lprocfs_osc_obd_vars;
  }
  #endif /* LPROCFS */
-
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c

index b82bf62..a4baa30 100644 (file)
--- a/lustre/osc/osc_create.c
+++ b/lustre/osc/osc_create.c
@@ -1,32 +1,45 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author Peter Braam <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *  For testing and management it is treated as an obd_device,
- *  although * it does not export a full OBD method table (the
- *  requests are coming * in over the wire, so object target modules
- *  do not have a full * method table.)
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/osc/osc_create.c
+ * For testing and management it is treated as an obd_device,
+ * although * it does not export a full OBD method table (the
+ * requests are coming * in over the wire, so object target modules
+ * do not have a full * method table.)
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -63,7 +76,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
  
          oscc = req->rq_async_args.pointer_arg[0];
          LASSERT(oscc && (oscc->oscc_obd != LP_POISON));
-        
+
          spin_lock(&oscc->oscc_lock);
          oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
          switch (rc) {
@@ -128,7 +141,7 @@ static int oscc_internal_create(struct osc_creator *oscc)
  {
          struct ptlrpc_request *request;
          struct ost_body *body;
-        int size[] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          ENTRY;
  
          LASSERT_SPIN_LOCKED(&oscc->oscc_lock);
@@ -139,7 +152,7 @@ static int oscc_internal_create(struct osc_creator *oscc)
                  RETURN(0);
          }
  
-        if (oscc->oscc_grow_count < OST_MAX_PRECREATE &&
+        if (oscc->oscc_grow_count < oscc->oscc_max_grow_count &&
              ((oscc->oscc_flags & OSCC_FLAG_LOW) == 0) &&
              (__s64)(oscc->oscc_last_id - oscc->oscc_next_id) <=
                     (oscc->oscc_grow_count / 4 + 1)) {
@@ -147,8 +160,8 @@ static int oscc_internal_create(struct osc_creator *oscc)
                  oscc->oscc_grow_count *= 2;
          }
  
-        if (oscc->oscc_grow_count > OST_MAX_PRECREATE / 2)
-                oscc->oscc_grow_count = OST_MAX_PRECREATE / 2;
+        if (oscc->oscc_grow_count > oscc->oscc_max_grow_count / 2)
+                oscc->oscc_grow_count = oscc->oscc_max_grow_count / 2;
  
          oscc->oscc_flags |= OSCC_FLAG_CREATING;
          spin_unlock(&oscc->oscc_lock);
@@ -267,21 +280,17 @@ int osc_precreate(struct obd_export *exp)
          if (imp != NULL && imp->imp_deactive)
                  RETURN(1000);
  
+        if (oscc_recovering(oscc))
+                RETURN(2);
+
+        if (oscc->oscc_flags & OSCC_FLAG_NOSPC)
+                RETURN(1000);
+
          if (oscc->oscc_last_id < oscc->oscc_next_id) {
-                spin_lock(&oscc->oscc_lock);
-                if (oscc->oscc_flags & OSCC_FLAG_NOSPC) {
-                        spin_unlock(&oscc->oscc_lock);
-                        RETURN(1000);
-                }
-                if (oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS) {
-                        spin_unlock(&oscc->oscc_lock);
+                if (oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS)
                          RETURN(1);
-                }
-                if (oscc->oscc_flags & OSCC_FLAG_RECOVERING) {
-                        spin_unlock(&oscc->oscc_lock);
-                        RETURN(2);
-                }
  
+                spin_lock(&oscc->oscc_lock);
                  if (oscc->oscc_flags & OSCC_FLAG_CREATING) {
                          spin_unlock(&oscc->oscc_lock);
                          RETURN(1);
@@ -438,6 +447,7 @@ void oscc_init(struct obd_device *obd)
          spin_lock_init(&oscc->oscc_lock);
          oscc->oscc_obd = obd;
          oscc->oscc_grow_count = OST_MIN_PRECREATE;
+        oscc->oscc_max_grow_count = OST_MAX_PRECREATE;
  
          oscc->oscc_next_id = 2;
          oscc->oscc_last_id = 1;
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h

index 2f0f41b..393e6d9 100644 (file)
--- a/lustre/osc/osc_internal.h
+++ b/lustre/osc/osc_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef OSC_INTERNAL_H
@@ -93,5 +125,10 @@ static inline int osc_should_resend(int resend, struct client_obd *cli)
                  atomic_read(&cli->cl_resends) > resend : 1; 
  }
  
+static inline int osc_exp_is_2_0_server(struct obd_export *exp) {
+       LASSERT(exp);
+       return !!(exp->exp_connect_flags & OBD_CONNECT_FID);
+}
+
  
  #endif /* OSC_INTERNAL_H */
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c

index a4f7394..6654809 100644 (file)
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -1,32 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author Peter Braam <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *  For testing and management it is treated as an obd_device,
- *  although * it does not export a full OBD method table (the
- *  requests are coming * in over the wire, so object target modules
- *  do not have a full * method table.)
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -70,7 +75,7 @@ static quota_interface_t *quota_interface;
  extern quota_interface_t osc_quota_interface;
  
  /* by default 10s */
-atomic_t osc_resend_time; 
+atomic_t osc_resend_time;
  
  /* Pack OSC object metadata for disk storage (LE byte order). */
  static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
@@ -191,7 +196,7 @@ static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
  {
          struct ptlrpc_request *req;
          struct ost_body *body;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          struct osc_async_args *aa;
          ENTRY;
  
@@ -207,7 +212,7 @@ static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
          req->rq_interpret_reply = osc_getattr_interpret;
  
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-        aa = (struct osc_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          aa->aa_oi = oinfo;
  
          ptlrpc_set_add_req(set, req);
@@ -218,7 +223,8 @@ static int osc_getattr(struct obd_export *exp, struct obd_info *oinfo)
  {
          struct ptlrpc_request *req;
          struct ost_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        int rc;
          ENTRY;
  
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
@@ -262,7 +268,8 @@ static int osc_setattr(struct obd_export *exp, struct obd_info *oinfo,
  {
          struct ptlrpc_request *req;
          struct ost_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        int rc;
          ENTRY;
  
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
@@ -320,12 +327,17 @@ static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
  {
          struct ptlrpc_request *req;
          struct ost_body *body;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body), sizeof(*body), 0 };
+        int bufcount = 2;
          struct osc_async_args *aa;
          ENTRY;
  
+        if (osc_exp_is_2_0_server(exp)) {
+                bufcount = 3;
+        }
+
          req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
-                              OST_SETATTR, 2, size, NULL);
+                              OST_SETATTR, bufcount, size, NULL);
          if (!req)
                  RETURN(-ENOMEM);
  
@@ -346,7 +358,7 @@ static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
                  req->rq_interpret_reply = osc_setattr_interpret;
  
                  CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-                aa = (struct osc_async_args *)&req->rq_async_args;
+                aa = ptlrpc_req_async_args(req);
                  aa->aa_oi = oinfo;
  
                  ptlrpc_set_add_req(rqset, req);
@@ -361,7 +373,8 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
          struct ptlrpc_request *req;
          struct ost_body *body;
          struct lov_stripe_md *lsm;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        int rc;
          ENTRY;
  
          LASSERT(oa);
@@ -464,7 +477,7 @@ static int osc_punch(struct obd_export *exp, struct obd_info *oinfo,
          struct ptlrpc_request *req;
          struct osc_async_args *aa;
          struct ost_body *body;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          ENTRY;
  
          if (!oinfo->oi_oa) {
@@ -492,22 +505,46 @@ static int osc_punch(struct obd_export *exp, struct obd_info *oinfo,
  
          req->rq_interpret_reply = osc_punch_interpret;
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-        aa = (struct osc_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          aa->aa_oi = oinfo;
          ptlrpc_set_add_req(rqset, req);
  
          RETURN(0);
  }
  
-static int osc_sync(struct obd_export *exp, struct obdo *oa,
-                    struct lov_stripe_md *md, obd_size start, obd_size end)
+static int osc_sync_interpret(struct ptlrpc_request *req,
+                              struct osc_async_args *aa, int rc)
+{
+        struct ost_body *body;
+        ENTRY;
+
+        if (rc)
+                GOTO(out, rc);
+
+        body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body),
+                                  lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("can't unpack ost_body\n");
+                GOTO(out, rc = -EPROTO);
+        }
+
+        *aa->aa_oi->oi_oa = body->oa;
+out:
+        rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+        RETURN(rc);
+}
+
+static int osc_sync(struct obd_export *exp, struct obd_info *oinfo,
+                    obd_size start, obd_size end,
+                    struct ptlrpc_request_set *set)
  {
          struct ptlrpc_request *req;
          struct ost_body *body;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        struct osc_async_args *aa;
          ENTRY;
  
-        if (!oa) {
+        if (!oinfo->oi_oa) {
                  CERROR("oa NULL\n");
                  RETURN(-EINVAL);
          }
@@ -518,7 +555,7 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa,
                  RETURN(-ENOMEM);
  
          body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
-        memcpy(&body->oa, oa, sizeof(*oa));
+        memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
  
          /* overload the size and blocks fields in the oa with start/end */
          body->oa.o_size = start;
@@ -526,39 +563,31 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa,
          body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
  
          ptlrpc_req_set_repsize(req, 2, size);
+        req->rq_interpret_reply = osc_sync_interpret;
  
-        rc = ptlrpc_queue_wait(req);
-        if (rc)
-                GOTO(out, rc);
-
-        body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body),
-                                  lustre_swab_ost_body);
-        if (body == NULL) {
-                CERROR ("can't unpack ost_body\n");
-                GOTO (out, rc = -EPROTO);
-        }
-
-        memcpy(oa, &body->oa, sizeof(*oa));
+        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+        aa = ptlrpc_req_async_args(req);
+        aa->aa_oi = oinfo;
  
-        EXIT;
- out:
-        ptlrpc_req_finished(req);
-        return rc;
+        ptlrpc_set_add_req(set, req);
+        RETURN (0);
  }
  
  /* Find and cancel locally locks matched by @mode in the resource found by
   * @objid. Found locks are added into @cancel list. Returns the amount of
   * locks added to @cancels list. */
-static int osc_resource_get_unused(struct obd_export *exp, __u64 objid,
+static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
                                     struct list_head *cancels, ldlm_mode_t mode,
                                     int lock_flags)
  {
          struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
-        struct ldlm_res_id res_id = { .name = { objid } };
-        struct ldlm_resource *res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+        struct ldlm_res_id res_id;
+        struct ldlm_resource *res;
          int count;
          ENTRY;
  
+        osc_build_res_name(oa->o_id, oa->o_gr, &res_id);
+        res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
          if (res == NULL)
                  RETURN(0);
  
@@ -613,7 +642,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa,
          CFS_LIST_HEAD(cancels);
          struct ptlrpc_request *req;
          struct ost_body *body;
-        int size[3] = { sizeof(struct ptlrpc_body), sizeof(*body),
+        __u32 size[3] = { sizeof(struct ptlrpc_body), sizeof(*body),
                          sizeof(struct ldlm_request) };
          int count, bufcount = 2;
          struct client_obd *cli = &exp->exp_obd->u.cli;
@@ -626,7 +655,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa,
  
          LASSERT(oa->o_id != 0);
  
-        count = osc_resource_get_unused(exp, oa->o_id, &cancels, LCK_PW,
+        count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
                                          LDLM_FL_DISCARD_DATA);
          if (exp_connect_cancelset(exp))
                  bufcount = 3;
@@ -835,7 +864,7 @@ static void handle_short_read(int nob_read, obd_count page_count,
  
                  if (pga[i]->count > nob_read) {
                          /* EOF inside this page */
-                        ptr = cfs_kmap(pga[i]->pg) + 
+                        ptr = cfs_kmap(pga[i]->pg) +
                                  (pga[i]->off & ~CFS_PAGE_MASK);
                          memset(ptr + nob_read, 0, pga[i]->count - nob_read);
                          cfs_kunmap(pga[i]->pg);
@@ -888,7 +917,7 @@ static int check_write_rcs(struct ptlrpc_request *req,
  
          if (req->rq_bulk->bd_nob_transferred != requested_nob) {
                  CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
-                       requested_nob, req->rq_bulk->bd_nob_transferred);
+                       req->rq_bulk->bd_nob_transferred, requested_nob);
                  return(-EPROTO);
          }
  
@@ -957,7 +986,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
          struct ost_body         *body;
          struct obd_ioobj        *ioobj;
          struct niobuf_remote    *niobuf;
-        int size[4] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[4] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          int niocount, i, requested_nob, opc, rc;
          struct ptlrpc_request_pool *pool;
          struct osc_brw_async_args *aa;
@@ -1048,8 +1077,8 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
          LASSERTF((void *)(niobuf - niocount) ==
                  lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2,
                                 niocount * sizeof(*niobuf)),
-                "want %p - real %p\n", lustre_msg_buf(req->rq_reqmsg, 
-                REQ_REC_OFF + 2, niocount * sizeof(*niobuf)), 
+                "want %p - real %p\n", lustre_msg_buf(req->rq_reqmsg,
+                REQ_REC_OFF + 2, niocount * sizeof(*niobuf)),
                  (void *)(niobuf - niocount));
  
          osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
@@ -1095,7 +1124,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
          }
  
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-        aa = (struct osc_brw_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          aa->aa_oa = oa;
          aa->aa_requested_nob = requested_nob;
          aa->aa_nio_count = niocount;
@@ -1152,7 +1181,7 @@ static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer,
                             "["LPU64"-"LPU64"]\n",
                             msg, libcfs_nid2str(peer->nid),
                             oa->o_valid & OBD_MD_FLFID ? oa->o_fid : (__u64)0,
-                           oa->o_valid & OBD_MD_FLFID ? oa->o_generation : 
+                           oa->o_valid & OBD_MD_FLFID ? oa->o_generation :
                                                          (__u64)0,
                             oa->o_id,
                             oa->o_valid & OBD_MD_FLGROUP ? oa->o_gr : (__u64)0,
@@ -1168,7 +1197,7 @@ static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer,
  /* Note rc enters this function as number of bytes transferred */
  static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
  {
-        struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+        struct osc_brw_async_args *aa = ptlrpc_req_async_args(req);
          const lnet_process_id_t *peer =
                          &req->rq_import->imp_connection->c_peer;
          struct client_obd *cli = aa->aa_cli;
@@ -1262,7 +1291,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
                  if (server_cksum == ~0 && rc > 0) {
                          CERROR("Protocol error: server %s set the 'checksum' "
                                 "bit, but didn't send a checksum.  Not fatal, "
-                               "but please tell CFS.\n",
+                               "but please notify on http://bugzilla.lustre.org/\n",
                                 libcfs_nid2str(peer->nid));
                  } else if (server_cksum != client_cksum) {
                          LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
@@ -1401,7 +1430,7 @@ int osc_brw_redo_request(struct ptlrpc_request *request,
          new_req->rq_async_args = request->rq_async_args;
          new_req->rq_sent = CURRENT_SECONDS + aa->aa_resends;
  
-        new_aa = (struct osc_brw_async_args *)&new_req->rq_async_args;
+        new_aa = ptlrpc_req_async_args(new_req);
  
          CFS_INIT_LIST_HEAD(&new_aa->aa_oaps);
          list_splice(&aa->aa_oaps, &new_aa->aa_oaps);
@@ -1414,9 +1443,9 @@ int osc_brw_redo_request(struct ptlrpc_request *request,
                  }
          }
  
-        /* use ptlrpc_set_add_req is safe because interpret functions work 
-         * in check_set context. only one way exist with access to request 
-         * from different thread got -EINTR - this way protected with 
+        /* use ptlrpc_set_add_req is safe because interpret functions work
+         * in check_set context. only one way exist with access to request
+         * from different thread got -EINTR - this way protected with
           * cl_loi_list_lock */
          ptlrpc_set_add_req(set, new_req);
  
@@ -1450,17 +1479,18 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
          rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm,
                                    page_count, pga, &request);
  
-        aa = (struct osc_brw_async_args *)&request->rq_async_args;
+        CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
+        aa = ptlrpc_req_async_args(request);
          if (cmd == OBD_BRW_READ) {
                  lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
                  lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
-                ptlrpc_lprocfs_brw(request, OST_READ, aa->aa_requested_nob);
          } else {
                  lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
                  lprocfs_oh_tally(&cli->cl_write_rpc_hist,
                                   cli->cl_w_in_flight);
-                ptlrpc_lprocfs_brw(request, OST_WRITE, aa->aa_requested_nob);
          }
+        ptlrpc_lprocfs_brw(request, aa->aa_requested_nob);
+
          LASSERT(list_empty(&aa->aa_oaps));
  
          if (rc == 0) {
@@ -1762,6 +1792,25 @@ static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop,
          RETURN(0);
  }
  
+static int lop_makes_hprpc(struct loi_oap_pages *lop)
+{
+        struct osc_async_page *oap;
+        ENTRY;
+
+        if (list_empty(&lop->lop_urgent))
+                RETURN(0);
+
+        oap = list_entry(lop->lop_urgent.next,
+                         struct osc_async_page, oap_urgent_item);
+
+        if (oap->oap_async_flags & ASYNC_HP) {
+                CDEBUG(D_CACHE, "hp request forcing RPC\n");
+                RETURN(1);
+        }
+
+        RETURN(0);
+}
+
  static void on_list(struct list_head *item, struct list_head *list,
                      int should_be_on)
  {
@@ -1775,9 +1824,17 @@ static void on_list(struct list_head *item, struct list_head *list,
   * can find pages to build into rpcs quickly */
  static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi)
  {
-        on_list(&loi->loi_cli_item, &cli->cl_loi_ready_list,
-                lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE) ||
-                lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ));
+        if (lop_makes_hprpc(&loi->loi_write_lop) ||
+            lop_makes_hprpc(&loi->loi_read_lop)) {
+                /* HP rpc */
+                on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list, 0);
+                on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
+        } else {
+                on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
+                on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list,
+                        lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)||
+                        lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ));
+        }
  
          on_list(&loi->loi_write_item, &cli->cl_loi_write_list,
                  loi->loi_write_lop.lop_num_pending);
@@ -1873,8 +1930,10 @@ static void osc_oap_to_pending(struct osc_async_page *oap)
          else
                  lop = &oap->oap_loi->loi_read_lop;
  
-        if (oap->oap_async_flags & ASYNC_URGENT)
+        if (oap->oap_async_flags & ASYNC_HP)
                  list_add(&oap->oap_urgent_item, &lop->lop_urgent);
+        else if (oap->oap_async_flags & ASYNC_URGENT)
+                list_add_tail(&oap->oap_urgent_item, &lop->lop_urgent);
          list_add_tail(&oap->oap_pending_item, &lop->lop_pending);
          lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, 1);
  }
@@ -1993,6 +2052,7 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
          void *caller_data = NULL;
          struct osc_async_page *oap;
          struct ldlm_lock *lock = NULL;
+        obd_valid valid;
          int i, rc;
  
          ENTRY;
@@ -2034,17 +2094,35 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
                  CERROR("prep_req failed: %d\n", rc);
                  GOTO(out, req = ERR_PTR(rc));
          }
+        oa = &((struct ost_body *)lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF,
+                                                 sizeof(struct ost_body)))->oa;
  
          /* Need to update the timestamps after the request is built in case
           * we race with setattr (locally or in queue at OST).  If OST gets
           * later setattr before earlier BRW (as determined by the request xid),
           * the OST will not use BRW timestamps.  Sadly, there is no obvious
           * way to do this in a single call.  bug 10150 */
-        ops->ap_update_obdo(caller_data, cmd, oa,
-                            OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME);
+        if (pga[0]->flag & OBD_BRW_SRVLOCK) {
+                /* in case of lockless read/write do not use inode's
+                 * timestamps because concurrent stat might fill the
+                 * inode with out-of-date times, send current
+                 * instead */
+                if (cmd & OBD_BRW_WRITE) {
+                        oa->o_mtime = oa->o_ctime = LTIME_S(CURRENT_TIME);
+                        oa->o_valid |= OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+                        valid = OBD_MD_FLATIME;
+                } else {
+                        oa->o_atime = LTIME_S(CURRENT_TIME);
+                        oa->o_valid |= OBD_MD_FLATIME;
+                        valid = OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+                }
+        } else {
+                valid = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME;
+        }
+        ops->ap_update_obdo(caller_data, cmd, oa, valid);
  
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-        aa = (struct osc_brw_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          CFS_INIT_LIST_HEAD(&aa->aa_oaps);
          list_splice(rpc_list, &aa->aa_oaps);
          CFS_INIT_LIST_HEAD(rpc_list);
@@ -2086,6 +2164,15 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
          int srvlock = 0;
          ENTRY;
  
+        /* If there are HP OAPs we need to handle at least 1 of them,
+         * move it the beginning of the pending list for that. */
+        if (!list_empty(&lop->lop_urgent)) {
+                oap = list_entry(lop->lop_urgent.next,
+                                 struct osc_async_page, oap_urgent_item);
+                if (oap->oap_async_flags & ASYNC_HP)
+                        list_move(&oap->oap_pending_item, &lop->lop_pending);
+        }
+
          /* first we find the pages we're allowed to work with */
          list_for_each_entry_safe(oap, tmp, &lop->lop_pending, oap_pending_item){
                  ops = oap->oap_caller_ops;
@@ -2234,21 +2321,20 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                  RETURN(PTR_ERR(req));
          }
  
-        aa = (struct osc_brw_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          if (cmd == OBD_BRW_READ) {
                  lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
                  lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
                  lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
                                        (starting_offset >> CFS_PAGE_SHIFT) + 1);
-                ptlrpc_lprocfs_brw(req, OST_READ, aa->aa_requested_nob);
          } else {
                  lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
                  lprocfs_oh_tally(&cli->cl_write_rpc_hist,
                                   cli->cl_w_in_flight);
                  lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
                                        (starting_offset >> CFS_PAGE_SHIFT) + 1);
-                ptlrpc_lprocfs_brw(req, OST_WRITE, aa->aa_requested_nob);
          }
+        ptlrpc_lprocfs_brw(req, aa->aa_requested_nob);
  
          client_obd_list_lock(&cli->cl_loi_list_lock);
  
@@ -2283,7 +2369,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
  
  #define LOI_DEBUG(LOI, STR, args...)                                     \
          CDEBUG(D_INODE, "loi ready %d wr %d:%d rd %d:%d " STR,           \
-               !list_empty(&(LOI)->loi_cli_item),                        \
+               !list_empty(&(LOI)->loi_ready_item) ||                    \
+               !list_empty(&(LOI)->loi_hp_ready_item),                   \
                 (LOI)->loi_write_lop.lop_num_pending,                     \
                 !list_empty(&(LOI)->loi_write_lop.lop_urgent),            \
                 (LOI)->loi_read_lop.lop_num_pending,                      \
@@ -2295,11 +2382,15 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
  struct lov_oinfo *osc_next_loi(struct client_obd *cli)
  {
          ENTRY;
-        /* first return all objects which we already know to have
-         * pages ready to be stuffed into rpcs */
+        /* First return objects that have blocked locks so that they
+         * will be flushed quickly and other clients can get the lock,
+         * then objects which have pages ready to be stuffed into RPCs */
+        if (!list_empty(&cli->cl_loi_hp_ready_list))
+                RETURN(list_entry(cli->cl_loi_hp_ready_list.next,
+                                  struct lov_oinfo, loi_hp_ready_item));
          if (!list_empty(&cli->cl_loi_ready_list))
                  RETURN(list_entry(cli->cl_loi_ready_list.next,
-                                  struct lov_oinfo, loi_cli_item));
+                                  struct lov_oinfo, loi_ready_item));
  
          /* then if we have cache waiters, return all objects with queued
           * writes.  This is especially important when many small files
@@ -2323,6 +2414,26 @@ struct lov_oinfo *osc_next_loi(struct client_obd *cli)
          RETURN(NULL);
  }
  
+static int osc_max_rpc_in_flight(struct client_obd *cli, struct lov_oinfo *loi)
+{
+        struct osc_async_page *oap;
+        int hprpc = 0;
+
+        if (!list_empty(&loi->loi_write_lop.lop_urgent)) {
+                oap = list_entry(loi->loi_write_lop.lop_urgent.next,
+                                 struct osc_async_page, oap_urgent_item);
+                hprpc = !!(oap->oap_async_flags & ASYNC_HP);
+        }
+
+        if (!hprpc && !list_empty(&loi->loi_read_lop.lop_urgent)) {
+                oap = list_entry(loi->loi_write_lop.lop_urgent.next,
+                                 struct osc_async_page, oap_urgent_item);
+                hprpc = !!(oap->oap_async_flags & ASYNC_HP);
+        }
+
+        return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc;
+}
+
  /* called with the loi list lock held */
  static void osc_check_rpcs(struct client_obd *cli)
  {
@@ -2333,7 +2444,7 @@ static void osc_check_rpcs(struct client_obd *cli)
          while ((loi = osc_next_loi(cli)) != NULL) {
                  LOI_DEBUG(loi, "%lu in flight\n", rpcs_in_flight(cli));
  
-                if (rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight)
+                if (osc_max_rpc_in_flight(cli, loi))
                          break;
  
                  /* attempt some read/write balancing by alternating between
@@ -2365,8 +2476,10 @@ static void osc_check_rpcs(struct client_obd *cli)
  
                  /* attempt some inter-object balancing by issueing rpcs
                   * for each object in turn */
-                if (!list_empty(&loi->loi_cli_item))
-                        list_del_init(&loi->loi_cli_item);
+                if (!list_empty(&loi->loi_hp_ready_item))
+                        list_del_init(&loi->loi_hp_ready_item);
+                if (!list_empty(&loi->loi_ready_item))
+                        list_del_init(&loi->loi_ready_item);
                  if (!list_empty(&loi->loi_write_item))
                          list_del_init(&loi->loi_write_item);
                  if (!list_empty(&loi->loi_read_item))
@@ -2523,9 +2636,9 @@ int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm,
  
          spin_lock_init(&oap->oap_lock);
  
-        /* If the page was marked as notcacheable - don't add to any locks */ 
+        /* If the page was marked as notcacheable - don't add to any locks */
          if (!nocache) {
-                oid.name[0] = loi->loi_id;
+                osc_build_res_name(loi->loi_id, loi->loi_gr, &oid);
                  /* This is the only place where we can call cache_add_extent
                     without oap_lock, because this page is locked now, and
                     the lock we are adding it to is referenced, so cannot lose
@@ -2570,7 +2683,6 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
                  RETURN(-EBUSY);
  
          /* check if the file's owner/group is over quota */
-#ifdef HAVE_QUOTA_SUPPORT
          if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)){
                  struct obd_async_page_ops *ops;
                  struct obdo *oa;
@@ -2589,7 +2701,6 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
                  if (rc)
                          RETURN(rc);
          }
-#endif
  
          if (loi == NULL)
                  loi = lsm->lsm_oinfo[0];
@@ -2673,11 +2784,14 @@ static int osc_set_async_flags(struct obd_export *exp,
          if (SETTING(oap->oap_async_flags, async_flags, ASYNC_READY))
                  oap->oap_async_flags |= ASYNC_READY;
  
-        if (SETTING(oap->oap_async_flags, async_flags, ASYNC_URGENT)) {
-                if (list_empty(&oap->oap_rpc_item)) {
+        if (SETTING(oap->oap_async_flags, async_flags, ASYNC_URGENT) &&
+            list_empty(&oap->oap_rpc_item)) {
+                if (oap->oap_async_flags & ASYNC_HP)
                          list_add(&oap->oap_urgent_item, &lop->lop_urgent);
-                        loi_list_maint(cli, loi);
-                }
+                else
+                        list_add_tail(&oap->oap_urgent_item, &lop->lop_urgent);
+                oap->oap_async_flags |= ASYNC_URGENT;
+                loi_list_maint(cli, loi);
          }
  
          LOI_DEBUG(loi, "oap %p page %p has flags %x\n", oap, oap->oap_page,
@@ -2812,8 +2926,9 @@ static int osc_teardown_async_page(struct obd_export *exp,
  
          if (!list_empty(&oap->oap_urgent_item)) {
                  list_del_init(&oap->oap_urgent_item);
-                oap->oap_async_flags &= ~ASYNC_URGENT;
+                oap->oap_async_flags &= ~(ASYNC_URGENT | ASYNC_HP);
          }
+
          if (!list_empty(&oap->oap_pending_item)) {
                  list_del_init(&oap->oap_pending_item);
                  lop_update_pending(cli, lop, oap->oap_cmd, -1);
@@ -2833,12 +2948,12 @@ int osc_extent_blocking_cb(struct ldlm_lock *lock,
  {
          struct lustre_handle lockh = { 0 };
          int rc;
-        ENTRY;  
-                
+        ENTRY;
+
          if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
                  LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
-                LBUG(); 
-        }       
+                LBUG();
+        }
  
          switch (flag) {
          case LDLM_CB_BLOCKING:
@@ -2904,9 +3019,10 @@ static void osc_set_data_with_check(struct lustre_handle *lockh, void *data,
  static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
                               ldlm_iterator_t replace, void *data)
  {
-        struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+        struct ldlm_res_id res_id;
          struct obd_device *obd = class_exp2obd(exp);
  
+        osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_gr, &res_id);
          ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data);
          return 0;
  }
@@ -2990,7 +3106,7 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
                         struct ldlm_enqueue_info *einfo,
                         struct ptlrpc_request_set *rqset)
  {
-        struct ldlm_res_id res_id = { .name = {oinfo->oi_md->lsm_object_id} };
+        struct ldlm_res_id res_id;
          struct obd_device *obd = exp->exp_obd;
          struct ldlm_reply *rep;
          struct ptlrpc_request *req = NULL;
@@ -2999,6 +3115,8 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
          int rc;
          ENTRY;
  
+        osc_build_res_name(oinfo->oi_md->lsm_object_id,
+                           oinfo->oi_md->lsm_object_gr, &res_id);
          /* Filesystem lock extents are extended to page boundaries so that
           * dealing with the page cache is a little smoother.  */
          oinfo->oi_policy.l_extent.start -=
@@ -3054,7 +3172,7 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
  
   no_match:
          if (intent) {
-                int size[3] = {
+                __u32 size[3] = {
                          [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                          [DLM_LOCKREQ_OFF]     = sizeof(struct ldlm_request),
                          [DLM_LOCKREQ_OFF + 1] = 0 };
@@ -3064,7 +3182,7 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
                          RETURN(-ENOMEM);
  
                  size[DLM_LOCKREPLY_OFF] = sizeof(*rep);
-                size[DLM_REPLY_REC_OFF] = 
+                size[DLM_REPLY_REC_OFF] =
                          sizeof(oinfo->oi_md->lsm_oinfo[0]->loi_lvb);
                  ptlrpc_req_set_repsize(req, 3, size);
          }
@@ -3082,7 +3200,7 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
                  if (!rc) {
                          struct osc_enqueue_args *aa;
                          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-                        aa = (struct osc_enqueue_args *)&req->rq_async_args;
+                        aa = ptlrpc_req_async_args(req);
                          aa->oa_oi = oinfo;
                          aa->oa_ei = einfo;
                          aa->oa_exp = exp;
@@ -3106,12 +3224,14 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm,
                       __u32 type, ldlm_policy_data_t *policy, __u32 mode,
                       int *flags, void *data, struct lustre_handle *lockh)
  {
-        struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+        struct ldlm_res_id res_id;
          struct obd_device *obd = exp->exp_obd;
          int lflags = *flags;
          ldlm_mode_t rc;
          ENTRY;
  
+        osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_gr, &res_id);
+
          OBD_FAIL_RETURN(OBD_FAIL_OSC_MATCH, -EIO);
  
          /* Filesystem lock extents are extended to page boundaries so that
@@ -3157,19 +3277,30 @@ static int osc_cancel_unused(struct obd_export *exp,
                               struct lov_stripe_md *lsm, int flags, void *opaque)
  {
          struct obd_device *obd = class_exp2obd(exp);
-        struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+        struct ldlm_res_id res_id, *resp = NULL;
+
+        if (lsm != NULL) {
+                resp = osc_build_res_name(lsm->lsm_object_id,
+                                          lsm->lsm_object_gr, &res_id);
+        }
+
+        return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque);
  
-        return ldlm_cli_cancel_unused(obd->obd_namespace, &res_id, flags,
-                                      opaque);
  }
  
  static int osc_join_lru(struct obd_export *exp,
                          struct lov_stripe_md *lsm, int join)
  {
          struct obd_device *obd = class_exp2obd(exp);
-        struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+        struct ldlm_res_id res_id, *resp = NULL;
+
+        if (lsm != NULL) {
+                resp = osc_build_res_name(lsm->lsm_object_id,
+                                          lsm->lsm_object_gr, &res_id);
+        }
+
+        return ldlm_cli_join_lru(obd->obd_namespace, resp, join);
  
-        return ldlm_cli_join_lru(obd->obd_namespace, &res_id, join);
  }
  
  static int osc_statfs_interpret(struct ptlrpc_request *req,
@@ -3199,7 +3330,7 @@ static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo,
  {
          struct ptlrpc_request *req;
          struct osc_async_args *aa;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*oinfo->oi_osfs) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*oinfo->oi_osfs) };
          ENTRY;
  
          /* We could possibly pass max_age in the request (as an absolute
@@ -3224,7 +3355,7 @@ static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo,
  
          req->rq_interpret_reply = osc_statfs_interpret;
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-        aa = (struct osc_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          aa->aa_oi = oinfo;
  
          ptlrpc_set_add_req(rqset, req);
@@ -3237,10 +3368,11 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
          struct obd_statfs *msfs;
          struct ptlrpc_request *req;
          struct obd_import     *imp = NULL;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*osfs) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*osfs) };
+        int rc;
          ENTRY;
  
-        /*Since the request might also come from lprocfs, so we need 
+        /*Since the request might also come from lprocfs, so we need
           *sync this with client_disconnect_export Bug15684*/
          down_read(&obd->u.cli.cl_sem);
          if (obd->u.cli.cl_import)
@@ -3248,7 +3380,7 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
          up_read(&obd->u.cli.cl_sem);
          if (!imp)
                  RETURN(-ENODEV);
-  
+
          /* We could possibly pass max_age in the request (as an absolute
           * timestamp or a "seconds.usec ago") so the target can avoid doing
           * extra calls into the filesystem if that isn't necessary (e.g.
@@ -3295,32 +3427,48 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
   *
   * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating
   * the maximum number of OST indices which will fit in the user buffer.
- * lmm_magic must be LOV_MAGIC (we only use 1 slot here).
+ * lmm_magic must be LOV_MAGIC_V1 or LOV_MAGIC_V3 (we only use 1 slot here).
   */
  static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
  {
-        struct lov_user_md lum, *lumk;
+        /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+        struct lov_user_md_v3 lum, *lumk;
          int rc = 0, lum_size;
+        struct lov_user_ost_data_v1 *lmm_objects;
          ENTRY;
  
          if (!lsm)
                  RETURN(-ENODATA);
  
-        if (copy_from_user(&lum, lump, sizeof(lum)))
+        /* we only need the header part from user space to get lmm_magic and
+         * lmm_stripe_count, (the header part is common to v1 and v3) */
+        lum_size = sizeof(struct lov_user_md_v1);
+        if (copy_from_user(&lum, lump, lum_size))
                  RETURN(-EFAULT);
  
-        if (lum.lmm_magic != LOV_USER_MAGIC)
+        if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+            (lum.lmm_magic != LOV_USER_MAGIC_V3))
                  RETURN(-EINVAL);
  
+        /* lov_user_md_vX and lov_mds_md_vX must have the same size */
+        LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
+        LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
+        LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
+
+        /* we can use lov_mds_md_size() to compute lum_size
+         * because lov_user_md_vX and lov_mds_md_vX have the same size */
          if (lum.lmm_stripe_count > 0) {
-                lum_size = sizeof(lum) + sizeof(lum.lmm_objects[0]);
+                lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
                  OBD_ALLOC(lumk, lum_size);
                  if (!lumk)
                          RETURN(-ENOMEM);
-
-                lumk->lmm_objects[0].l_object_id = lsm->lsm_object_id;
+                if (lum.lmm_magic == LOV_USER_MAGIC_V1)
+                        lmm_objects = &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
+                else
+                        lmm_objects = &(lumk->lmm_objects[0]);
+                lmm_objects->l_object_id = lsm->lsm_object_id;
          } else {
-                lum_size = sizeof(lum);
+                lum_size = lov_mds_md_size(0, lum.lmm_magic);
                  lumk = &lum;
          }
  
@@ -3345,14 +3493,10 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
          int err = 0;
          ENTRY;
  
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        MOD_INC_USE_COUNT;
-#else
          if (!try_module_get(THIS_MODULE)) {
                  CERROR("Can't get module. Is it alive?");
                  return -EINVAL;
          }
-#endif
          switch (cmd) {
          case OBD_IOC_LOV_GET_CONFIG: {
                  char *buf;
@@ -3418,7 +3562,7 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
          case OBD_IOC_DESTROY: {
                  struct obdo            *oa;
  
-                if (!capable (CAP_SYS_ADMIN))
+                if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, err = -EPERM);
                  oa = &data->ioc_obdo1;
  
@@ -3430,22 +3574,21 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                  err = osc_destroy(exp, oa, NULL, NULL, NULL);
                  GOTO(out, err);
          }
+        case OBD_IOC_PING_TARGET:
+                err = ptlrpc_obd_ping(obd);
+                GOTO(out, err);
          default:
                  CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
                         cmd, cfs_curproc_comm());
                  GOTO(out, err = -ENOTTY);
          }
  out:
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        MOD_DEC_USE_COUNT;
-#else
          module_put(THIS_MODULE);
-#endif
          return err;
  }
  
  static int osc_get_info(struct obd_export *exp, obd_count keylen,
-                        void *key, __u32 *vallen, void *val)
+                        void *key, __u32 *vallen, void *val, struct lov_stripe_md *lsm)
  {
          ENTRY;
          if (!vallen || !val)
@@ -3460,7 +3603,8 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
                  struct ptlrpc_request *req;
                  obd_id *reply;
                  char *bufs[2] = { NULL, key };
-                int rc, size[2] = { sizeof(struct ptlrpc_body), keylen };
+                __u32 size[2] = { sizeof(struct ptlrpc_body), keylen };
+                int rc;
  
                  req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
                                        OST_GET_INFO, 2, size, bufs);
@@ -3483,7 +3627,39 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
          out:
                  ptlrpc_req_finished(req);
                  RETURN(rc);
+        } else if (KEY_IS(KEY_FIEMAP)) {
+                struct ptlrpc_request *req;
+                struct ll_user_fiemap *reply;
+                char *bufs[2] = { NULL, key };
+                __u32 size[2] = { sizeof(struct ptlrpc_body), keylen };
+                int rc;
+
+                req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                      OST_GET_INFO, 2, size, bufs);
+                if (req == NULL)
+                        RETURN(-ENOMEM);
+
+                size[REPLY_REC_OFF] = *vallen;
+                ptlrpc_req_set_repsize(req, 2, size);
+
+                rc = ptlrpc_queue_wait(req);
+                if (rc)
+                        GOTO(out1, rc);
+                reply = lustre_swab_repbuf(req, REPLY_REC_OFF, *vallen,
+                                           lustre_swab_fiemap);
+                if (reply == NULL) {
+                        CERROR("Can't unpack FIEMAP reply.\n");
+                        GOTO(out1, rc = -EPROTO);
+                }
+
+                memcpy(val, reply, *vallen);
+
+        out1:
+                ptlrpc_req_finished(req);
+
+                RETURN(rc);
          }
+
          RETURN(-EINVAL);
  }
  
@@ -3523,7 +3699,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
          struct ptlrpc_request *req;
          struct obd_device  *obd = exp->exp_obd;
          struct obd_import *imp = class_exp2cliimp(exp);
-        int size[3] = { sizeof(struct ptlrpc_body), keylen, vallen };
+        __u32 size[3] = { sizeof(struct ptlrpc_body), keylen, vallen };
          char *bufs[3] = { NULL, key, val };
          ENTRY;
  
@@ -3599,7 +3775,7 @@ static struct llog_operations osc_size_repl_logops = {
  
  static struct llog_operations osc_mds_ost_orig_logops;
  static int osc_llog_init(struct obd_device *obd, struct obd_device *tgt,
-                         int count, struct llog_catid *catid, 
+                         int count, struct llog_catid *catid,
                           struct obd_uuid *uuid)
  {
          int rc;
@@ -3624,11 +3800,16 @@ static int osc_llog_init(struct obd_device *obd, struct obd_device *tgt,
  
          rc = llog_setup(obd, LLOG_SIZE_REPL_CTXT, tgt, count, NULL,
                          &osc_size_repl_logops);
-        if (rc) 
+        if (rc) {
+                struct llog_ctxt *ctxt = 
+                        llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+                if (ctxt)
+                        llog_cleanup(ctxt);
                  CERROR("failed LLOG_SIZE_REPL_CTXT\n");
+        }
  out:
          if (rc) {
-                CERROR("osc '%s' tgt '%s' cnt %d catid %p rc=%d\n", 
+                CERROR("osc '%s' tgt '%s' cnt %d catid %p rc=%d\n",
                         obd->obd_name, tgt->obd_name, count, catid, rc);
                  CERROR("logid "LPX64":0x%x\n",
                         catid->lci_logid.lgl_oid, catid->lci_logid.lgl_ogen);
@@ -3657,7 +3838,8 @@ static int osc_llog_finish(struct obd_device *obd, int count)
  
  static int osc_reconnect(struct obd_export *exp, struct obd_device *obd,
                           struct obd_uuid *cluuid,
-                         struct obd_connect_data *data)
+                         struct obd_connect_data *data,
+                         void *localdata)
  {
          struct client_obd *cli = &obd->u.cli;
  
@@ -3685,14 +3867,21 @@ static int osc_reconnect(struct obd_export *exp, struct obd_device *obd,
  static int osc_disconnect(struct obd_export *exp)
  {
          struct obd_device *obd = class_exp2obd(exp);
-        struct llog_ctxt *ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+        struct llog_ctxt  *ctxt;
          int rc;
  
-        if (obd->u.cli.cl_conn_count == 1)
-                /* flush any remaining cancel messages out to the target */
-                llog_sync(ctxt, exp);
-        
-        llog_ctxt_put(ctxt);
+        ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+        if (ctxt) {
+                if (obd->u.cli.cl_conn_count == 1) {
+                        /* Flush any remaining cancel messages out to the 
+                         * target */
+                        llog_sync(ctxt, exp);
+                }
+                llog_ctxt_put(ctxt);
+        } else {
+                CDEBUG(D_HA, "No LLOG_SIZE_REPL_CTXT found in obd %p\n", 
+                       obd);
+        }
  
          rc = client_disconnect_export(exp);
          return rc;
@@ -3841,12 +4030,17 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
                     client import will not have been cleaned. */
                  if (obd->u.cli.cl_import) {
                          struct obd_import *imp;
+                        down_write(&obd->u.cli.cl_sem);
                          imp = obd->u.cli.cl_import;
                          CDEBUG(D_CONFIG, "%s: client import never connected\n",
                                 obd->obd_name);
                          ptlrpc_invalidate_import(imp);
-                        ptlrpc_free_rq_pool(imp->imp_rq_pool);
+                        if (imp->imp_rq_pool) {
+                                ptlrpc_free_rq_pool(imp->imp_rq_pool);
+                                imp->imp_rq_pool = NULL;
+                        }
                          class_destroy_import(imp);
+                        up_write(&obd->u.cli.cl_sem);
                          obd->u.cli.cl_import = NULL;
                  }
                  rc = obd_llog_finish(obd, 0);
@@ -4020,7 +4214,7 @@ static void /*__exit*/ osc_exit(void)
          class_unregister_type(LUSTRE_OSC_NAME);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/ost/autoMakefile.am b/lustre/ost/autoMakefile.am

index f178425..8db3fe4 100644 (file)
--- a/lustre/ost/autoMakefile.am
+++ b/lustre/ost/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if MODULES
  modulefs_DATA = ost$(KMODEXT)
diff --git a/lustre/ost/lproc_ost.c b/lustre/ost/lproc_ost.c

index 77eeaca..9cb8912 100644 (file)
--- a/lustre/ost/lproc_ost.c
+++ b/lustre/ost/lproc_ost.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_OST
  
diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c

index f2a43ea..9eb25aa 100644 (file)
--- a/lustre/ost/ost_handler.c
+++ b/lustre/ost/ost_handler.c
@@ -1,36 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *  Storage Target Handling functions
- *  Lustre Object Server Module (OST)
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *  This server is single threaded at present (but can easily be multi
- *  threaded). For testing and management it is treated as an
- *  obd_device, although it does not export a full OBD method table
- *  (the requests are coming in over the wire, so object target
- *  modules do not have a full method table.)
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ost/ost_handler.c
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #ifndef EXPORT_SYMTAB
@@ -46,9 +52,9 @@
  #include <lustre_debug.h>
  #include <linux/init.h>
  #include <lprocfs_status.h>
-#include <lustre_commit_confd.h>
  #include <libcfs/list.h>
  #include <lustre_quota.h>
+#include <lustre_log.h>
  #include "ost_internal.h"
  
  static int oss_num_threads;
@@ -71,8 +77,12 @@ void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
          if (oti == NULL)
                  return;
  
-        if (req->rq_repmsg)
+        if (req->rq_repmsg) {
+                __u64 versions[PTLRPC_NUM_VERSIONS] = { 0 };
                  lustre_msg_set_transno(req->rq_repmsg, oti->oti_transno);
+                versions[0] = oti->oti_pre_version;
+                lustre_msg_set_versions(req->rq_repmsg, versions);
+        }
          req->rq_transno = oti->oti_transno;
  
          /* XXX 4 == entries in oti_ack_locks??? */
@@ -88,7 +98,8 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req,
                         struct obd_trans_info *oti)
  {
          struct ost_body *body, *repbody;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        int rc;
          ENTRY;
  
          body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
@@ -125,7 +136,8 @@ static int ost_getattr(struct obd_export *exp, struct ptlrpc_request *req)
  {
          struct ost_body *body, *repbody;
          struct obd_info oinfo = { { { 0 } } };
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        int rc;
          ENTRY;
  
          body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
@@ -149,7 +161,8 @@ static int ost_getattr(struct obd_export *exp, struct ptlrpc_request *req)
  static int ost_statfs(struct ptlrpc_request *req)
  {
          struct obd_statfs *osfs;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*osfs) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*osfs) };
+        int rc;
          ENTRY;
  
          rc = lustre_pack_reply(req, 2, size, NULL);
@@ -158,7 +171,7 @@ static int ost_statfs(struct ptlrpc_request *req)
  
          osfs = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*osfs));
  
-        req->rq_status = obd_statfs(req->rq_export->exp_obd, osfs, 
+        req->rq_status = obd_statfs(req->rq_export->exp_obd, osfs,
                                      cfs_time_current_64() - HZ, 0);
          if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_ENOSPC))
                  osfs->os_bfree = osfs->os_bavail = 64;
@@ -172,7 +185,8 @@ static int ost_create(struct obd_export *exp, struct ptlrpc_request *req,
                        struct obd_trans_info *oti)
  {
          struct ost_body *body, *repbody;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        int rc;
          ENTRY;
  
          body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
@@ -236,7 +250,7 @@ static int ost_punch_lock_get(struct obd_export *exp, struct obdo *oa,
          else
                  policy.l_extent.end = finis | ~CFS_PAGE_MASK;
  
-        RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id, 
+        RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id,
                                        LDLM_EXTENT, &policy, LCK_PW, &flags,
                                        ldlm_blocking_ast, ldlm_completion_ast,
                                        ldlm_glimpse_ast, NULL, 0, NULL, lh));
@@ -260,17 +274,17 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req,
  {
          struct obd_info oinfo = { { { 0 } } };
          struct ost_body *body, *repbody;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        int rc;
          struct lustre_handle lh = {0,};
          ENTRY;
  
          /* check that we do support OBD_CONNECT_TRUNCLOCK. */
          CLASSERT(OST_CONNECT_SUPPORTED & OBD_CONNECT_TRUNCLOCK);
  
-        body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
-                                  lustre_swab_ost_body);
-        if (body == NULL)
-                RETURN(-EFAULT);
+        /* ost_body is varified and swabbed in ost_hpreq_handler() */
+        body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
+        LASSERT(body != NULL);
  
          oinfo.oi_oa = &body->oa;
          oinfo.oi_policy.l_extent.start = oinfo.oi_oa->o_size;
@@ -306,8 +320,10 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req,
  
  static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req)
  {
+        struct obd_info oinfo = { { { 0 } } };
          struct ost_body *body, *repbody;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        int rc;
          ENTRY;
  
          body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
@@ -321,9 +337,11 @@ static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req)
  
          repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
                                   sizeof(*repbody));
-        memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
-        req->rq_status = obd_sync(exp, &repbody->oa, NULL, repbody->oa.o_size,
-                                  repbody->oa.o_blocks);
+
+        oinfo.oi_oa = &body->oa;
+        req->rq_status = obd_sync(exp, &oinfo, repbody->oa.o_size,
+                                  repbody->oa.o_blocks, NULL);
+        repbody->oa = *oinfo.oi_oa;
          RETURN(0);
  }
  
@@ -331,7 +349,8 @@ static int ost_setattr(struct obd_export *exp, struct ptlrpc_request *req,
                         struct obd_trans_info *oti)
  {
          struct ost_body *body, *repbody;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*repbody) };
+        int rc;
          struct obd_info oinfo = { { { 0 } } };
          ENTRY;
  
@@ -362,100 +381,6 @@ static int ost_bulk_timeout(void *data)
          RETURN(1);
  }
  
-static int get_per_page_niobufs(struct obd_ioobj *ioo, int nioo,
-                                struct niobuf_remote *rnb, int nrnb,
-                                struct niobuf_remote **pp_rnbp)
-{
-        /* Copy a remote niobuf, splitting it into page-sized chunks
-         * and setting ioo[i].ioo_bufcnt accordingly */
-        struct niobuf_remote *pp_rnb;
-        int   i;
-        int   j;
-        int   page;
-        int   rnbidx = 0;
-        int   npages = 0;
-
-        /*
-         * array of sufficient size already preallocated by caller
-         */
-        LASSERT(pp_rnbp != NULL);
-        LASSERT(*pp_rnbp != NULL);
-
-        /* first count and check the number of pages required */
-        for (i = 0; i < nioo; i++)
-                for (j = 0; j < ioo->ioo_bufcnt; j++, rnbidx++) {
-                        obd_off offset = rnb[rnbidx].offset;
-                        obd_off p0 = offset >> CFS_PAGE_SHIFT;
-                        obd_off pn = (offset + rnb[rnbidx].len - 1)>>CFS_PAGE_SHIFT;
-
-                        LASSERT(rnbidx < nrnb);
-
-                        npages += (pn + 1 - p0);
-
-                        if (rnb[rnbidx].len == 0) {
-                                CERROR("zero len BRW: obj %d objid "LPX64
-                                       " buf %u\n", i, ioo[i].ioo_id, j);
-                                return -EINVAL;
-                        }
-                        if (j > 0 &&
-                            rnb[rnbidx].offset <= rnb[rnbidx-1].offset) {
-                                CERROR("unordered BRW: obj %d objid "LPX64
-                                       " buf %u offset "LPX64" <= "LPX64"\n",
-                                       i, ioo[i].ioo_id, j, rnb[rnbidx].offset,
-                                       rnb[rnbidx].offset);
-                                return -EINVAL;
-                        }
-                }
-
-        LASSERT(rnbidx == nrnb);
-
-        if (npages == nrnb) {       /* all niobufs are for single pages */
-                *pp_rnbp = rnb;
-                return npages;
-        }
-
-        pp_rnb = *pp_rnbp;
-
-        /* now do the actual split */
-        page = rnbidx = 0;
-        for (i = 0; i < nioo; i++) {
-                int  obj_pages = 0;
-
-                for (j = 0; j < ioo[i].ioo_bufcnt; j++, rnbidx++) {
-                        obd_off off = rnb[rnbidx].offset;
-                        int     nob = rnb[rnbidx].len;
-
-                        LASSERT(rnbidx < nrnb);
-                        do {
-                                obd_off  poff = off & ~CFS_PAGE_MASK;
-                                int      pnob = (poff + nob > CFS_PAGE_SIZE) ?
-                                                CFS_PAGE_SIZE - poff : nob;
-
-                                LASSERT(page < npages);
-                                pp_rnb[page].len = pnob;
-                                pp_rnb[page].offset = off;
-                                pp_rnb[page].flags = rnb[rnbidx].flags;
-
-                                CDEBUG(0, "   obj %d id "LPX64
-                                       "page %d(%d) "LPX64" for %d, flg %x\n",
-                                       i, ioo[i].ioo_id, obj_pages, page,
-                                       pp_rnb[page].offset, pp_rnb[page].len,
-                                       pp_rnb[page].flags);
-                                page++;
-                                obj_pages++;
-
-                                off += pnob;
-                                nob -= pnob;
-                        } while (nob > 0);
-                        LASSERT(nob == 0);
-                }
-                ioo[i].ioo_bufcnt = obj_pages;
-        }
-        LASSERT(page == npages);
-
-        return npages;
-}
-
  static __u32 ost_checksum_bulk(struct ptlrpc_bulk_desc *desc, int opc,
                                 cksum_type_t cksum_type)
  {
@@ -478,62 +403,17 @@ static __u32 ost_checksum_bulk(struct ptlrpc_bulk_desc *desc, int opc,
                  /* corrupt the data after we compute the checksum, to
                   * simulate an OST->client data error */
                  if (i == 0 && opc == OST_READ &&
-                    OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_CHECKSUM_SEND))
+                    OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_CHECKSUM_SEND)) {
                          memcpy(ptr, "bad4", min(4, len));
+                        /* nobody should use corrupted page again */
+                        ClearPageUptodate(page);
+                }
                  kunmap(page);
          }
  
          return cksum;
  }
  
-/*
- * populate @nio by @nrpages pages from per-thread page pool
- */
-static void ost_nio_pages_get(struct ptlrpc_request *req,
-                              struct niobuf_local *nio, int nrpages)
-{
-        int i;
-        struct ost_thread_local_cache *tls;
-
-        ENTRY;
-
-        LASSERT(nrpages <= OST_THREAD_POOL_SIZE);
-        LASSERT(req != NULL);
-        LASSERT(req->rq_svc_thread != NULL);
-
-        tls = ost_tls(req);
-        LASSERT(tls != NULL);
-
-        memset(nio, 0, nrpages * sizeof *nio);
-        for (i = 0; i < nrpages; ++ i) {
-                struct page *page;
-
-                page = tls->page[i];
-                LASSERT(page != NULL);
-                POISON_PAGE(page, 0xf1);
-                nio[i].page = page;
-                LL_CDEBUG_PAGE(D_INFO, page, "%d\n", i);
-        }
-        EXIT;
-}
-
-/*
- * Dual for ost_nio_pages_get(). Poison pages in pool for debugging
- */
-static void ost_nio_pages_put(struct ptlrpc_request *req,
-                              struct niobuf_local *nio, int nrpages)
-{
-        int i;
-
-        ENTRY;
-
-        LASSERT(nrpages <= OST_THREAD_POOL_SIZE);
-
-        for (i = 0; i < nrpages; ++ i)
-                POISON_PAGE(nio[i].page, 0xf2);
-        EXIT;
-}
-
  static int ost_brw_lock_get(int mode, struct obd_export *exp,
                              struct obd_ioobj *obj, struct niobuf_remote *nb,
                              struct lustre_handle *lh)
@@ -561,7 +441,7 @@ static int ost_brw_lock_get(int mode, struct obd_export *exp,
          policy.l_extent.end   = (nb[nrbufs - 1].offset +
                                   nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK;
  
-        RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id, 
+        RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id,
                                        LDLM_EXTENT, &policy, mode, &flags,
                                        ldlm_blocking_ast, ldlm_completion_ast,
                                        ldlm_glimpse_ast, NULL, 0, NULL, lh));
@@ -583,7 +463,10 @@ static void ost_brw_lock_put(int mode,
  struct ost_prolong_data {
          struct obd_export *opd_exp;
          ldlm_policy_data_t opd_policy;
+        struct obdo *opd_oa;
          ldlm_mode_t opd_mode;
+        int opd_lock_match;
+        int opd_timeout;
  };
  
  static int ost_prolong_locks_iter(struct ldlm_lock *lock, void *data)
@@ -613,6 +496,14 @@ static int ost_prolong_locks_iter(struct ldlm_lock *lock, void *data)
                  return LDLM_ITER_CONTINUE;
          }
  
+        /* Fill the obdo with the matched lock handle.
+         * XXX: it is possible in some cases the IO RPC is covered by several
+         * locks, even for the write case, so it may need to be a lock list. */
+        if (opd->opd_oa && !(opd->opd_oa->o_valid & OBD_MD_FLHANDLE)) {
+                opd->opd_oa->o_handle.cookie = lock->l_handle.h_cookie;
+                opd->opd_oa->o_valid |= OBD_MD_FLHANDLE;
+        }
+
          if (!(lock->l_flags & LDLM_FL_AST_SENT)) {
                  /* ignore locks not being cancelled */
                  return LDLM_ITER_CONTINUE;
@@ -620,29 +511,36 @@ static int ost_prolong_locks_iter(struct ldlm_lock *lock, void *data)
  
          /* OK. this is a possible lock the user holds doing I/O
           * let's refresh eviction timer for it */
-        ldlm_refresh_waiting_lock(lock);
+        ldlm_refresh_waiting_lock(lock, opd->opd_timeout);
+        opd->opd_lock_match = 1;
  
          return LDLM_ITER_CONTINUE;
  }
  
-static void ost_prolong_locks(struct obd_export *exp, struct obd_ioobj *obj,
-                              struct niobuf_remote *nb, struct obdo *oa,
-                              ldlm_mode_t mode)
+static int ost_rw_prolong_locks(struct ptlrpc_request *req, struct obd_ioobj *obj,
+                                struct niobuf_remote *nb, struct obdo *oa,
+                                ldlm_mode_t mode)
  
  
  {
          struct ldlm_res_id res_id = { .name = { obj->ioo_id } };
+        struct ost_prolong_data opd = { 0 };
          int nrbufs = obj->ioo_bufcnt;
-        struct ost_prolong_data opd;
  
          ENTRY;
  
          opd.opd_mode = mode;
-        opd.opd_exp = exp;
+        opd.opd_exp = req->rq_export;
          opd.opd_policy.l_extent.start = nb[0].offset & CFS_PAGE_MASK;
          opd.opd_policy.l_extent.end = (nb[nrbufs - 1].offset +
                                         nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK;
  
+        /* prolong locks for the current service time of the corresponding
+         * portal (= OST_IO_PORTAL) */
+        opd.opd_timeout = AT_OFF ? obd_timeout / 2 :
+                          max(at_est2timeout(at_get(&req->rq_rqbd->
+                              rqbd_service->srv_at_estimate)), ldlm_timeout);
+
          CDEBUG(D_DLMTRACE,"refresh locks: "LPU64"/"LPU64" ("LPU64"->"LPU64")\n",
                 res_id.name[0], res_id.name[1], opd.opd_policy.l_extent.start,
                 opd.opd_policy.l_extent.end);
@@ -653,30 +551,41 @@ static void ost_prolong_locks(struct obd_export *exp, struct obd_ioobj *obj,
                  lock = ldlm_handle2lock(&oa->o_handle);
                  if (lock != NULL) {
                          ost_prolong_locks_iter(lock, &opd);
+                        if (opd.opd_lock_match) {
+                                LDLM_LOCK_PUT(lock);
+                                RETURN(1);
+                        }
+
+                        /* Check if the lock covers the whole IO region,
+                         * otherwise iterate through the resource. */
+                        if (lock->l_policy_data.l_extent.end >=
+                            opd.opd_policy.l_extent.end &&
+                            lock->l_policy_data.l_extent.start <=
+                            opd.opd_policy.l_extent.start) {
+                                LDLM_LOCK_PUT(lock);
+                                RETURN(0);
+                        }
                          LDLM_LOCK_PUT(lock);
-                        EXIT;
-                        return;
                  }
          }
  
-        ldlm_resource_iterate(exp->exp_obd->obd_namespace, &res_id,
+        opd.opd_oa = oa;
+        ldlm_resource_iterate(req->rq_export->exp_obd->obd_namespace, &res_id,
                                ost_prolong_locks_iter, &opd);
-
-        EXIT;
+        RETURN(opd.opd_lock_match);
  }
  
  static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
  {
-        struct ptlrpc_bulk_desc *desc;
+        struct ptlrpc_bulk_desc *desc = NULL;
          struct obd_export       *exp = req->rq_export;
          struct niobuf_remote *remote_nb;
-        struct niobuf_remote *pp_rnb = NULL;
          struct niobuf_local *local_nb;
          struct obd_ioobj *ioo;
          struct ost_body *body, *repbody;
          struct l_wait_info lwi;
          struct lustre_handle lockh = { 0 };
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32  size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          int niocount, npages, nob = 0, rc, i;
          int no_reply = 0;
          ENTRY;
@@ -697,39 +606,18 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
          if (exp->exp_failed)
                  GOTO(out, rc = -ENOTCONN);
  
-        body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
-                                  lustre_swab_ost_body);
-        if (body == NULL) {
-                CERROR("Missing/short ost_body\n");
-                GOTO(out, rc = -EFAULT);
-        }
+        /* ost_body, ioobj & noibuf_remote are verified and swabbed in
+         * ost_rw_hpreq_check(). */
+        body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
+        LASSERT(body != NULL);
  
-        ioo = lustre_swab_reqbuf(req, REQ_REC_OFF + 1, sizeof(*ioo),
-                                 lustre_swab_obd_ioobj);
-        if (ioo == NULL) {
-                CERROR("Missing/short ioobj\n");
-                GOTO(out, rc = -EFAULT);
-        }
+        ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, sizeof(*ioo));
+        LASSERT(ioo != NULL);
  
          niocount = ioo->ioo_bufcnt;
-        if (niocount > PTLRPC_MAX_BRW_PAGES) {
-                DEBUG_REQ(D_ERROR, req, "bulk has too many pages (%d)",
-                          niocount);
-                GOTO(out, rc = -EFAULT);
-        }
-
-        remote_nb = lustre_swab_reqbuf(req, REQ_REC_OFF + 2,
-                                       niocount * sizeof(*remote_nb),
-                                       lustre_swab_niobuf_remote);
-        if (remote_nb == NULL) {
-                CERROR("Missing/short niobuf\n");
-                GOTO(out, rc = -EFAULT);
-        }
-        if (lustre_req_need_swab(req)) {
-                /* swab remaining niobufs */
-                for (i = 1; i < niocount; i++)
-                        lustre_swab_niobuf_remote (&remote_nb[i]);
-        }
+        remote_nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2,
+                                   niocount * sizeof(*remote_nb));
+        LASSERT(remote_nb != NULL);
  
          rc = lustre_pack_reply(req, 2, size, NULL);
          if (rc)
@@ -740,32 +628,16 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
           * ost_thread_init().
           */
          local_nb = ost_tls(req)->local;
-        pp_rnb   = ost_tls(req)->remote;
-
-        /* FIXME all niobuf splitting should be done in obdfilter if needed */
-        /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */
-        npages = get_per_page_niobufs(ioo, 1, remote_nb, niocount, &pp_rnb);
-        if (npages < 0)
-                GOTO(out, rc = npages);
  
-        LASSERT(npages <= OST_THREAD_POOL_SIZE);
-
-        ost_nio_pages_get(req, local_nb, npages);
-
-        desc = ptlrpc_prep_bulk_exp(req, npages,
-                                     BULK_PUT_SOURCE, OST_BULK_PORTAL);
-        if (desc == NULL)
-                GOTO(out, rc = -ENOMEM);
-
-        rc = ost_brw_lock_get(LCK_PR, exp, ioo, pp_rnb, &lockh);
+        rc = ost_brw_lock_get(LCK_PR, exp, ioo, remote_nb, &lockh);
          if (rc != 0)
                  GOTO(out_bulk, rc);
  
-        /* 
+        /*
           * If getting the lock took more time than
           * client was willing to wait, drop it. b=11330
           */
-        if (cfs_time_current_sec() > req->rq_deadline || 
+        if (cfs_time_current_sec() > req->rq_deadline ||
              OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
                  no_reply = 1;
                  CERROR("Dropping timed-out read from %s because locking"
@@ -776,12 +648,18 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                  GOTO(out_lock, rc = -ETIMEDOUT);
          }
  
-        rc = obd_preprw(OBD_BRW_READ, exp, &body->oa, 1,
-                        ioo, npages, pp_rnb, local_nb, oti);
+        npages = OST_THREAD_POOL_SIZE;
+        rc = obd_preprw(OBD_BRW_READ, exp, &body->oa, 1, ioo,
+                        remote_nb, &npages, local_nb, oti);
          if (rc != 0)
                  GOTO(out_lock, rc);
  
-        ost_prolong_locks(exp, ioo, pp_rnb, &body->oa, LCK_PW | LCK_PR);
+        desc = ptlrpc_prep_bulk_exp(req, npages,
+                                     BULK_PUT_SOURCE, OST_BULK_PORTAL);
+        if (desc == NULL) /* XXX: check all cleanup stuff */
+                GOTO(out, rc = -ENOMEM);
+
+        ost_rw_prolong_locks(req, ioo, remote_nb, &body->oa, LCK_PW | LCK_PR);
  
          nob = 0;
          for (i = 0; i < npages; i++) {
@@ -792,17 +670,15 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                          break;
                  }
  
-                LASSERTF(page_rc <= pp_rnb[i].len, "page_rc (%d) > "
-                         "pp_rnb[%d].len (%d)\n", page_rc, i, pp_rnb[i].len);
                  nob += page_rc;
                  if (page_rc != 0) {             /* some data! */
                          LASSERT (local_nb[i].page != NULL);
                          ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                              pp_rnb[i].offset & ~CFS_PAGE_MASK,
+                                              local_nb[i].offset & ~CFS_PAGE_MASK,
                                                page_rc);
                  }
  
-                if (page_rc != pp_rnb[i].len) { /* short read */
+                if (page_rc != local_nb[i].len) { /* short read */
                          /* All subsequent pages should be 0 */
                          while(++i < npages)
                                  LASSERT(local_nb[i].rc == 0);
@@ -845,20 +721,20 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                  if (rc == 0) {
                          time_t start = cfs_time_current_sec();
                          do {
-                                long timeoutl = req->rq_deadline - 
+                                long timeoutl = req->rq_deadline -
                                          cfs_time_current_sec();
-                                cfs_duration_t timeout = (timeoutl <= 0 || rc) ? 
+                                cfs_duration_t timeout = (timeoutl <= 0 || rc) ?
                                          CFS_TICK : cfs_time_seconds(timeoutl);
-                                lwi = LWI_TIMEOUT_INTERVAL(timeout, 
+                                lwi = LWI_TIMEOUT_INTERVAL(timeout,
                                                             cfs_time_seconds(1),
-                                                           ost_bulk_timeout, 
+                                                           ost_bulk_timeout,
                                                             desc);
-                                rc = l_wait_event(desc->bd_waitq, 
-                                                  !ptlrpc_bulk_active(desc) ||
+                                rc = l_wait_event(desc->bd_waitq,
+                                                  !ptlrpc_server_bulk_active(desc) ||
                                                    exp->exp_failed, &lwi);
                                  LASSERT(rc == 0 || rc == -ETIMEDOUT);
                                  /* Wait again if we changed deadline */
-                        } while ((rc == -ETIMEDOUT) && 
+                        } while ((rc == -ETIMEDOUT) &&
                                   (req->rq_deadline > cfs_time_current_sec()));
  
                          if (rc == -ETIMEDOUT) {
@@ -889,10 +765,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
          }
  
          /* Must commit after prep above in all cases */
-        rc = obd_commitrw(OBD_BRW_READ, exp, &body->oa, 1,
-                          ioo, npages, local_nb, oti, rc);
-
-        ost_nio_pages_put(req, local_nb, npages);
+        rc = obd_commitrw(OBD_BRW_READ, exp, &body->oa, 1, ioo,
+                          remote_nb, npages, local_nb, oti, rc);
  
          if (rc == 0) {
                  repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
@@ -901,13 +775,15 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
          }
  
   out_lock:
-        ost_brw_lock_put(LCK_PR, ioo, pp_rnb, &lockh);
+        ost_brw_lock_put(LCK_PR, ioo, remote_nb, &lockh);
   out_bulk:
-        ptlrpc_free_bulk(desc);
+        if (desc)
+                ptlrpc_free_bulk(desc);
   out:
          LASSERT(rc <= 0);
          if (rc == 0) {
                  req->rq_status = nob;
+                ptlrpc_lprocfs_brw(req, nob);
                  target_committed_to_req(req);
                  ptlrpc_reply(req);
          } else if (!no_reply) {
@@ -931,22 +807,21 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
  
  static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
  {
-        struct ptlrpc_bulk_desc *desc;
+        struct ptlrpc_bulk_desc *desc = NULL;
          struct obd_export       *exp = req->rq_export;
          struct niobuf_remote    *remote_nb;
-        struct niobuf_remote    *pp_rnb;
          struct niobuf_local     *local_nb;
          struct obd_ioobj        *ioo;
          struct ost_body         *body, *repbody;
          struct l_wait_info       lwi;
          struct lustre_handle     lockh = {0};
          __u32                   *rcs;
-        int size[3] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          int objcount, niocount, npages;
-        int rc, swab, i, j;
+        int rc, i, j;
          obd_count                client_cksum = 0, server_cksum = 0;
          cksum_type_t             cksum_type = OBD_CKSUM_CRC32;
-        int                      no_reply = 0; 
+        int                      no_reply = 0;
          ENTRY;
  
          if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK))
@@ -968,56 +843,22 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
          if (exp->exp_failed)
                  GOTO(out, rc = -ENOTCONN);
  
-        swab = lustre_req_need_swab(req);
-        body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
-                                  lustre_swab_ost_body);
-        if (body == NULL) {
-                CERROR("Missing/short ost_body\n");
-                GOTO(out, rc = -EFAULT);
-        }
+        /* ost_body, ioobj & noibuf_remote are verified and swabbed in
+         * ost_rw_hpreq_check(). */
+        body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
+        LASSERT(body != NULL);
  
          objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) /
                     sizeof(*ioo);
-        if (objcount == 0) {
-                CERROR("Missing/short ioobj\n");
-                GOTO(out, rc = -EFAULT);
-        }
-        if (objcount > 1) {
-                CERROR("too many ioobjs (%d)\n", objcount);
-                GOTO(out, rc = -EFAULT);
-        }
-
-        lustre_set_req_swabbed(req, REQ_REC_OFF + 1);
          ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1,
                               objcount * sizeof(*ioo));
-        LASSERT (ioo != NULL);
-        for (niocount = i = 0; i < objcount; i++) {
-                if (swab)
-                        lustre_swab_obd_ioobj(&ioo[i]);
-                if (ioo[i].ioo_bufcnt == 0) {
-                        CERROR("ioo[%d] has zero bufcnt\n", i);
-                        GOTO(out, rc = -EFAULT);
-                }
+        LASSERT(ioo != NULL);
+        for (niocount = i = 0; i < objcount; i++)
                  niocount += ioo[i].ioo_bufcnt;
-        }
  
-        if (niocount > PTLRPC_MAX_BRW_PAGES) {
-                DEBUG_REQ(D_ERROR, req, "bulk has too many pages (%d)",
-                          niocount);
-                GOTO(out, rc = -EFAULT);
-        }
-
-        remote_nb = lustre_swab_reqbuf(req, REQ_REC_OFF + 2,
-                                       niocount * sizeof(*remote_nb),
-                                       lustre_swab_niobuf_remote);
-        if (remote_nb == NULL) {
-                CERROR("Missing/short niobuf\n");
-                GOTO(out, rc = -EFAULT);
-        }
-        if (swab) {                             /* swab the remaining niobufs */
-                for (i = 1; i < niocount; i++)
-                        lustre_swab_niobuf_remote (&remote_nb[i]);
-        }
+        remote_nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2,
+                                   niocount * sizeof(*remote_nb));
+        LASSERT(remote_nb != NULL);
  
          size[REPLY_REC_OFF + 1] = niocount * sizeof(*rcs);
          rc = lustre_pack_reply(req, 3, size, NULL);
@@ -1033,32 +874,16 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
           * ost_thread_init().
           */
          local_nb = ost_tls(req)->local;
-        pp_rnb   = ost_tls(req)->remote;
-
-        /* FIXME all niobuf splitting should be done in obdfilter if needed */
-        /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */
-        npages = get_per_page_niobufs(ioo, objcount,remote_nb,niocount,&pp_rnb);
-        if (npages < 0)
-                GOTO(out, rc = npages);
  
-        LASSERT(npages <= OST_THREAD_POOL_SIZE);
-
-        ost_nio_pages_get(req, local_nb, npages);
-
-        desc = ptlrpc_prep_bulk_exp(req, npages,
-                                     BULK_GET_SINK, OST_BULK_PORTAL);
-        if (desc == NULL)
-                GOTO(out, rc = -ENOMEM);
-
-        rc = ost_brw_lock_get(LCK_PW, exp, ioo, pp_rnb, &lockh);
+        rc = ost_brw_lock_get(LCK_PW, exp, ioo, remote_nb, &lockh);
          if (rc != 0)
                  GOTO(out_bulk, rc);
  
-        /* 
+        /*
           * If getting the lock took more time than
           * client was willing to wait, drop it. b=11330
           */
-        if (cfs_time_current_sec() > req->rq_deadline || 
+        if (cfs_time_current_sec() > req->rq_deadline ||
              OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
                  no_reply = 1;
                  CERROR("Dropping timed-out write from %s because locking "
@@ -1069,7 +894,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                  GOTO(out_lock, rc = -ETIMEDOUT);
          }
  
-        ost_prolong_locks(exp, ioo, pp_rnb, &body->oa, LCK_PW);
+        ost_rw_prolong_locks(req, ioo, remote_nb,&body->oa,  LCK_PW);
  
          /* obd_preprw clobbers oa->valid, so save what we need */
          if (body->oa.o_valid & OBD_MD_FLCKSUM) {
@@ -1079,46 +904,52 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
          }
  
          /* Because we already sync grant info with client when reconnect,
-         * grant info will be cleared for resent req, then fed_grant and 
-         * total_grant will not be modified in following preprw_write*/ 
+         * grant info will be cleared for resent req, then fed_grant and
+         * total_grant will not be modified in following preprw_write*/
          if (lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) {
                  DEBUG_REQ(D_CACHE, req, "clear resent/replay req grant info");
                  body->oa.o_valid &= ~OBD_MD_FLGRANT;
          }
  
+        npages = OST_THREAD_POOL_SIZE;
          rc = obd_preprw(OBD_BRW_WRITE, exp, &body->oa, objcount,
-                        ioo, npages, pp_rnb, local_nb, oti);
+                        ioo, remote_nb, &npages, local_nb, oti);
          if (rc != 0)
                  GOTO(out_lock, rc);
  
+        desc = ptlrpc_prep_bulk_exp(req, npages,
+                                     BULK_GET_SINK, OST_BULK_PORTAL);
+        if (desc == NULL)
+                GOTO(out, rc = -ENOMEM);
+
          /* NB Having prepped, we must commit... */
  
          for (i = 0; i < npages; i++)
                  ptlrpc_prep_bulk_page(desc, local_nb[i].page,
-                                      pp_rnb[i].offset & ~CFS_PAGE_MASK,
-                                      pp_rnb[i].len);
+                                      local_nb[i].offset & ~CFS_PAGE_MASK,
+                                      local_nb[i].len);
  
          /* Check if client was evicted while we were doing i/o before touching
             network */
          if (desc->bd_export->exp_failed)
                  rc = -ENOTCONN;
          else
-                rc = ptlrpc_start_bulk_transfer (desc);
+                rc = ptlrpc_start_bulk_transfer(desc);
          if (rc == 0) {
                  time_t start = cfs_time_current_sec();
                  do {
-                        long timeoutl = req->rq_deadline - 
+                        long timeoutl = req->rq_deadline -
                                  cfs_time_current_sec();
-                        cfs_duration_t timeout = (timeoutl <= 0 || rc) ? 
+                        cfs_duration_t timeout = (timeoutl <= 0 || rc) ?
                                  CFS_TICK : cfs_time_seconds(timeoutl);
                          lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
                                                     ost_bulk_timeout, desc);
-                        rc = l_wait_event(desc->bd_waitq, 
-                                          !ptlrpc_bulk_active(desc) ||
+                        rc = l_wait_event(desc->bd_waitq,
+                                          !ptlrpc_server_bulk_active(desc) ||
                                            desc->bd_export->exp_failed, &lwi);
                          LASSERT(rc == 0 || rc == -ETIMEDOUT);
                          /* Wait again if we changed deadline */
-                } while ((rc == -ETIMEDOUT) && 
+                } while ((rc == -ETIMEDOUT) &&
                           (req->rq_deadline > cfs_time_current_sec()));
  
                  if (rc == -ETIMEDOUT) {
@@ -1182,8 +1013,8 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                  rc = -ENOTCONN;
  
          /* Must commit after prep above in all cases */
-        rc = obd_commitrw(OBD_BRW_WRITE, exp, &repbody->oa,
-                           objcount, ioo, npages, local_nb, oti, rc);
+        rc = obd_commitrw(OBD_BRW_WRITE, exp, &repbody->oa, objcount, ioo,
+                          remote_nb, npages, local_nb, oti, rc);
  
          if (unlikely(client_cksum != server_cksum && rc == 0)) {
                  int  new_cksum = ost_checksum_bulk(desc, OST_WRITE, cksum_type);
@@ -1218,38 +1049,41 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                     body->oa.o_id,
                                     body->oa.o_valid & OBD_MD_FLGROUP ?
                                                  body->oa.o_gr : (__u64)0,
-                                   pp_rnb[0].offset,
-                                   pp_rnb[npages-1].offset+pp_rnb[npages-1].len
-                                   - 1 );
+                                   local_nb[0].offset,
+                                   local_nb[npages-1].offset +
+                                   local_nb[npages-1].len - 1 );
                  CERROR("client csum %x, original server csum %x, "
                         "server csum now %x\n",
                         client_cksum, server_cksum, new_cksum);
          }
  
-        ost_nio_pages_put(req, local_nb, npages);
-
          if (rc == 0) {
+                int nob = 0;
+
                  /* set per-requested niobuf return codes */
                  for (i = j = 0; i < niocount; i++) {
-                        int nob = remote_nb[i].len;
+                        int len = remote_nb[i].len;
  
+                        nob += len;
                          rcs[i] = 0;
                          do {
                                  LASSERT(j < npages);
                                  if (local_nb[j].rc < 0)
                                          rcs[i] = local_nb[j].rc;
-                                nob -= pp_rnb[j].len;
+                                len -= local_nb[j].len;
                                  j++;
-                        } while (nob > 0);
-                        LASSERT(nob == 0);
+                        } while (len > 0);
+                        LASSERT(len == 0);
                  }
                  LASSERT(j == npages);
+                ptlrpc_lprocfs_brw(req, nob);
          }
  
   out_lock:
-        ost_brw_lock_put(LCK_PW, ioo, pp_rnb, &lockh);
+        ost_brw_lock_put(LCK_PW, ioo, remote_nb, &lockh);
   out_bulk:
-        ptlrpc_free_bulk(desc);
+        if (desc)
+                ptlrpc_free_bulk(desc);
   out:
          if (rc == 0) {
                  oti_to_request(oti, req);
@@ -1322,7 +1156,7 @@ static int ost_get_info(struct obd_export *exp, struct ptlrpc_request *req)
          keylen = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF);
  
          /* call once to get the size to allocate the reply buffer */
-        rc = obd_get_info(exp, keylen, key, &size[1], NULL);
+        rc = obd_get_info(exp, keylen, key, &size[1], NULL, NULL);
          if (rc)
                  RETURN(rc);
  
@@ -1332,16 +1166,18 @@ static int ost_get_info(struct obd_export *exp, struct ptlrpc_request *req)
  
          reply = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*reply));
          /* call again to fill in the reply buffer */
-        rc = obd_get_info(exp, keylen, key, size, reply);
+        rc = obd_get_info(exp, keylen, key, size, reply, NULL);
          lustre_msg_set_status(req->rq_repmsg, 0);
  
          RETURN(rc);
  }
  
+#ifdef HAVE_QUOTA_SUPPORT
  static int ost_handle_quotactl(struct ptlrpc_request *req)
  {
          struct obd_quotactl *oqctl, *repoqc;
-        int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*repoqc) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*repoqc) };
+        int rc;
          ENTRY;
  
          oqctl = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*oqctl),
@@ -1369,7 +1205,7 @@ static int ost_handle_quotacheck(struct ptlrpc_request *req)
  
          oqctl = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*oqctl),
                                     lustre_swab_obd_quotactl);
-        if (oqctl == NULL) 
+        if (oqctl == NULL)
                  RETURN(-EPROTO);
  
          rc = lustre_pack_reply(req, 1, NULL, NULL);
@@ -1383,10 +1219,12 @@ static int ost_handle_quotacheck(struct ptlrpc_request *req)
  static int ost_handle_quota_adjust_qunit(struct ptlrpc_request *req)
  {
          struct quota_adjust_qunit *oqaq, *repoqa;
+        struct lustre_quota_ctxt *qctxt;
          int size[2] = { sizeof(struct ptlrpc_body), sizeof(*repoqa) };
          int rc;
          ENTRY;
  
+        qctxt = &req->rq_export->exp_obd->u.obt.obt_qctxt;
          oqaq = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*oqaq),
                                    lustre_swab_quota_adjust_qunit);
  
@@ -1396,11 +1234,12 @@ static int ost_handle_quota_adjust_qunit(struct ptlrpc_request *req)
          if (rc)
                  GOTO(out, rc);
          repoqa = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*repoqa));
-        req->rq_status = obd_quota_adjust_qunit(req->rq_export, oqaq);
+        req->rq_status = obd_quota_adjust_qunit(req->rq_export, oqaq, qctxt);
          *repoqa = *oqaq;
   out:
          RETURN(rc);
  }
+#endif
  
  static int ost_filter_recovery_request(struct ptlrpc_request *req,
                                         struct obd_device *obd, int *process)
@@ -1458,9 +1297,11 @@ int ost_msg_check_version(struct lustre_msg *msg)
          case OST_SYNC:
          case OST_SET_INFO:
          case OST_GET_INFO:
+#ifdef HAVE_QUOTA_SUPPORT
          case OST_QUOTACHECK:
          case OST_QUOTACTL:
          case OST_QUOTA_ADJUST_QUNIT:
+#endif
                  rc = lustre_msg_check_version(msg, LUSTRE_OST_VERSION);
                  if (rc)
                          CERROR("bad opc %u version %08x, expecting %08x\n",
@@ -1496,6 +1337,257 @@ int ost_msg_check_version(struct lustre_msg *msg)
          return rc;
  }
  
+static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req,
+                                       struct ldlm_lock *lock)
+{
+        struct niobuf_remote *nb;
+        struct obd_ioobj *ioo;
+        struct ost_body *body;
+        int objcount, niocount;
+        int mode, opc, i;
+        __u64 start, end;
+        ENTRY;
+
+        opc = lustre_msg_get_opc(req->rq_reqmsg);
+        LASSERT(opc == OST_READ || opc == OST_WRITE);
+
+        /* As the request may be covered by several locks, do not look at
+         * o_handle, look at the RPC IO region. */
+        body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
+                                  lustre_swab_obdo);
+        objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) /
+                   sizeof(*ioo);
+        ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1,
+                             objcount * sizeof(*ioo));
+        LASSERT(ioo != NULL);
+        for (niocount = i = 0; i < objcount; i++)
+                niocount += ioo[i].ioo_bufcnt;
+
+        nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2,
+                            niocount * sizeof(*nb));
+        LASSERT(nb != NULL);
+
+        mode = LCK_PW;
+        if (opc == OST_READ)
+                mode |= LCK_PR;
+
+        start = nb[0].offset & CFS_PAGE_MASK;
+        end = (nb[ioo->ioo_bufcnt - 1].offset +
+               nb[ioo->ioo_bufcnt - 1].len - 1) | ~CFS_PAGE_MASK;
+
+        if (!(lock->l_granted_mode & mode))
+                RETURN(0);
+
+        if (lock->l_policy_data.l_extent.end < start ||
+            lock->l_policy_data.l_extent.start > end)
+                RETURN(0);
+
+        RETURN(1);
+}
+
+/**
+ * Swab buffers needed to call ost_rw_prolong_locks() and call it.
+ * Return the value from ost_rw_prolong_locks() which is non-zero if
+ * there is a cancelled lock which is waiting for this IO request.
+ */
+static int ost_rw_hpreq_check(struct ptlrpc_request *req)
+{
+        struct niobuf_remote *nb;
+        struct obd_ioobj *ioo;
+        struct ost_body *body;
+        int objcount, niocount;
+        int mode, opc, i;
+        ENTRY;
+
+        opc = lustre_msg_get_opc(req->rq_reqmsg);
+        LASSERT(opc == OST_READ || opc == OST_WRITE);
+
+        body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
+        LASSERT(body != NULL);
+
+        objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) /
+                   sizeof(*ioo);
+        ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1,
+                             objcount * sizeof(*ioo));
+        LASSERT(ioo != NULL);
+
+        for (niocount = i = 0; i < objcount; i++)
+                niocount += ioo[i].ioo_bufcnt;
+        nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2,
+                            niocount * sizeof(*nb));
+        LASSERT(nb != NULL);
+        LASSERT(niocount == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK));
+
+        mode = LCK_PW;
+        if (opc == OST_READ)
+                mode |= LCK_PR;
+        RETURN(ost_rw_prolong_locks(req, ioo, nb, &body->oa, mode));
+}
+
+static int ost_punch_prolong_locks(struct ptlrpc_request *req, struct obdo *oa)
+{
+        struct ldlm_res_id res_id = { .name = { oa->o_id } };
+        struct ost_prolong_data opd = { 0 };
+        __u64 start, end;
+        ENTRY;
+
+        start = oa->o_size;
+        end = start + oa->o_blocks;
+
+        opd.opd_mode = LCK_PW;
+        opd.opd_exp = req->rq_export;
+        opd.opd_policy.l_extent.start = start & CFS_PAGE_MASK;
+        if (oa->o_blocks == OBD_OBJECT_EOF || end < start)
+                opd.opd_policy.l_extent.end = OBD_OBJECT_EOF;
+        else
+                opd.opd_policy.l_extent.end = end | ~CFS_PAGE_MASK;
+
+        /* prolong locks for the current service time of the corresponding
+         * portal (= OST_IO_PORTAL) */
+        opd.opd_timeout = AT_OFF ? obd_timeout / 2 :
+                          max(at_est2timeout(at_get(&req->rq_rqbd->
+                              rqbd_service->srv_at_estimate)), ldlm_timeout);
+        
+        CDEBUG(D_DLMTRACE,"refresh locks: "LPU64"/"LPU64" ("LPU64"->"LPU64")\n",
+               res_id.name[0], res_id.name[1], opd.opd_policy.l_extent.start,
+               opd.opd_policy.l_extent.end);
+
+        opd.opd_oa = oa;
+
+        ldlm_resource_iterate(req->rq_export->exp_obd->obd_namespace, &res_id,
+                              ost_prolong_locks_iter, &opd);
+        RETURN(opd.opd_lock_match);
+}
+
+static int ost_punch_hpreq_lock_match(struct ptlrpc_request *req,
+                                      struct ldlm_lock *lock)
+{
+        struct ost_body *body;
+        ENTRY;
+
+        body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
+                                  lustre_swab_obdo);
+        LASSERT(body != NULL);
+
+        if (body->oa.o_valid & OBD_MD_FLHANDLE &&
+            body->oa.o_handle.cookie == lock->l_handle.h_cookie)
+                RETURN(1);
+        RETURN(0);
+}
+
+static int ost_punch_hpreq_check(struct ptlrpc_request *req)
+{
+        struct ost_body *body = lustre_msg_buf(req->rq_reqmsg,
+                                               REQ_REC_OFF, sizeof(*body));
+        LASSERT(body != NULL);
+        LASSERT(!(body->oa.o_valid & OBD_MD_FLFLAGS) ||
+                !(body->oa.o_flags & OBD_FL_TRUNCLOCK));
+
+        RETURN(ost_punch_prolong_locks(req, &body->oa));
+}
+
+struct ptlrpc_hpreq_ops ost_hpreq_rw = {
+        .hpreq_lock_match  = ost_rw_hpreq_lock_match,
+        .hpreq_check       = ost_rw_hpreq_check,
+};
+
+struct ptlrpc_hpreq_ops ost_hpreq_punch = {
+        .hpreq_lock_match  = ost_punch_hpreq_lock_match,
+        .hpreq_check       = ost_punch_hpreq_check,
+};
+
+/** Assign high priority operations to the request if needed. */
+static int ost_hpreq_handler(struct ptlrpc_request *req)
+{
+        ENTRY;
+        if (req->rq_export) {
+                int opc = lustre_msg_get_opc(req->rq_reqmsg);
+                struct ost_body *body;
+
+                if (opc == OST_READ || opc == OST_WRITE) {
+                        struct niobuf_remote *nb;
+                        struct obd_ioobj *ioo;
+                        int objcount, niocount;
+                        int swab, i;
+
+                        body = lustre_swab_reqbuf(req, REQ_REC_OFF,
+                                                  sizeof(*body),
+                                                  lustre_swab_obdo);
+                        if (!body) {
+                                CERROR("Missing/short ost_body\n");
+                                RETURN(-EFAULT);
+                        }
+                        objcount = lustre_msg_buflen(req->rq_reqmsg,
+                                                     REQ_REC_OFF + 1) /
+                                sizeof(*ioo);
+                        if (objcount == 0) {
+                                CERROR("Missing/short ioobj\n");
+                                RETURN(-EFAULT);
+                        }
+                        if (objcount > 1) {
+                                CERROR("too many ioobjs (%d)\n", objcount);
+                                RETURN(-EFAULT);
+                        }
+
+                        swab = !lustre_req_swabbed(req, REQ_REC_OFF + 1) &&
+                                lustre_req_need_swab(req);
+                        ioo = lustre_swab_reqbuf(req, REQ_REC_OFF + 1,
+                                                 objcount * sizeof(*ioo),
+                                                 lustre_swab_obd_ioobj);
+                        if (!ioo) {
+                                CERROR("Missing/short ioobj\n");
+                                RETURN(-EFAULT);
+                        }
+                        for (niocount = i = 0; i < objcount; i++) {
+                                if (i > 0 && swab)
+                                        lustre_swab_obd_ioobj(&ioo[i]);
+                                if (ioo[i].ioo_bufcnt == 0) {
+                                        CERROR("ioo[%d] has zero bufcnt\n", i);
+                                        RETURN(-EFAULT);
+                                }
+                                niocount += ioo[i].ioo_bufcnt;
+                        }
+                        if (niocount > PTLRPC_MAX_BRW_PAGES) {
+                                DEBUG_REQ(D_ERROR, req, "bulk has too many "
+                                          "pages (%d)", niocount);
+                                RETURN(-EFAULT);
+                        }
+
+                        swab = !lustre_req_swabbed(req, REQ_REC_OFF + 2) &&
+                                lustre_req_need_swab(req);
+                        nb = lustre_swab_reqbuf(req, REQ_REC_OFF + 2,
+                                                niocount * sizeof(*nb),
+                                                lustre_swab_niobuf_remote);
+                        if (!nb) {
+                                CERROR("Missing/short niobuf\n");
+                                RETURN(-EFAULT);
+                        }
+
+                        if (swab) {
+                                /* swab remaining niobufs */
+                                for (i = 1; i < niocount; i++)
+                                        lustre_swab_niobuf_remote(&nb[i]);
+                        }
+
+                        if (niocount == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK))
+                                req->rq_ops = &ost_hpreq_rw;
+                } else if (opc == OST_PUNCH) {
+                        body = lustre_swab_reqbuf(req, REQ_REC_OFF,
+                                                  sizeof(*body),
+                                                  lustre_swab_obdo);
+                        if (!body) {
+                                CERROR("Missing/short ost_body\n");
+                                RETURN(-EFAULT);
+                        }
+
+                        if (!(body->oa.o_valid & OBD_MD_FLFLAGS) ||
+                            !(body->oa.o_flags & OBD_FL_TRUNCLOCK))
+                                req->rq_ops = &ost_hpreq_punch;
+                }
+        }
+        RETURN(0);
+}
+
  static int ost_handle(struct ptlrpc_request *req)
  {
          struct obd_trans_info trans_info = { 0, };
@@ -1507,7 +1599,7 @@ static int ost_handle(struct ptlrpc_request *req)
          LASSERT(current->journal_info == NULL);
          /* XXX identical to MDS */
          if (lustre_msg_get_opc(req->rq_reqmsg) != OST_CONNECT) {
-                int abort_recovery, recovering;
+                int recovering;
  
                  if (req->rq_export == NULL) {
                          CDEBUG(D_HA,"operation %d on unconnected OST from %s\n",
@@ -1521,12 +1613,10 @@ static int ost_handle(struct ptlrpc_request *req)
  
                  /* Check for aborted recovery. */
                  spin_lock_bh(&obd->obd_processing_task_lock);
-                abort_recovery = obd->obd_abort_recovery;
                  recovering = obd->obd_recovering;
                  spin_unlock_bh(&obd->obd_processing_task_lock);
-                if (abort_recovery) {
-                        target_abort_recovery(obd);
-                } else if (recovering) {
+                if (recovering &&
+                    target_recovery_check_and_stop(obd) == 0) {
                          rc = ost_filter_recovery_request(req, obd,
                                                           &should_process);
                          if (rc || !should_process)
@@ -1539,10 +1629,6 @@ static int ost_handle(struct ptlrpc_request *req)
          if (rc)
                  RETURN(rc);
  
-        rc = ost_msg_check_version(req->rq_reqmsg);
-        if (rc)
-                RETURN(rc);
-
          switch (lustre_msg_get_opc(req->rq_reqmsg)) {
          case OST_CONNECT: {
                  CDEBUG(D_INODE, "connect\n");
@@ -1644,6 +1730,7 @@ static int ost_handle(struct ptlrpc_request *req)
                  DEBUG_REQ(D_INODE, req, "get_info");
                  rc = ost_get_info(req->rq_export, req);
                  break;
+#ifdef HAVE_QUOTA_SUPPORT
          case OST_QUOTACHECK:
                  CDEBUG(D_INODE, "quotacheck\n");
                  OBD_FAIL_RETURN(OBD_FAIL_OST_QUOTACHECK_NET, 0);
@@ -1658,6 +1745,7 @@ static int ost_handle(struct ptlrpc_request *req)
                  CDEBUG(D_INODE, "quota_adjust_qunit\n");
                  rc = ost_handle_quota_adjust_qunit(req);
                  break;
+#endif
          case OBD_PING:
                  DEBUG_REQ(D_INODE, req, "ping");
                  rc = target_handle_ping(req);
@@ -1675,6 +1763,7 @@ static int ost_handle(struct ptlrpc_request *req)
                  CDEBUG(D_INODE, "log cancel\n");
                  OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_NET, 0);
                  rc = llog_origin_handle_cancel(req);
+                OBD_FAIL_RETURN(OBD_FAIL_OBD_LOG_CANCEL_REP, 0);
                  req->rq_status = rc;
                  rc = lustre_pack_reply(req, 1, NULL, NULL);
                  if (rc)
@@ -1719,20 +1808,9 @@ static int ost_handle(struct ptlrpc_request *req)
                  target_committed_to_req(req);
  
  out:
-        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
-                if (obd && obd->obd_recovering) {
-                        DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
-                        return target_queue_last_replay_reply(req, rc);
-                }
-                /* Lost a race with recovery; let the error path DTRT. */
-                rc = req->rq_status = -ENOTCONN;
-        }
-
          if (!rc)
                  oti_to_request(oti, req);
-
-        target_send_reply(req, rc, fail);
-        return 0;
+        return target_handle_reply(req, rc, fail);
  }
  
  /*
@@ -1740,7 +1818,6 @@ out:
   */
  static void ost_thread_done(struct ptlrpc_thread *thread)
  {
-        int i;
          struct ost_thread_local_cache *tls; /* TLS stands for Thread-Local
                                               * Storage */
  
@@ -1754,10 +1831,6 @@ static void ost_thread_done(struct ptlrpc_thread *thread)
           */
          tls = thread->t_data;
          if (tls != NULL) {
-                for (i = 0; i < OST_THREAD_POOL_SIZE; ++ i) {
-                        if (tls->page[i] != NULL)
-                                OBD_PAGE_FREE(tls->page[i]);
-                }
                  OBD_FREE_PTR(tls);
                  thread->t_data = NULL;
          }
@@ -1769,8 +1842,6 @@ static void ost_thread_done(struct ptlrpc_thread *thread)
   */
  static int ost_thread_init(struct ptlrpc_thread *thread)
  {
-        int result;
-        int i;
          struct ost_thread_local_cache *tls;
  
          ENTRY;
@@ -1780,23 +1851,10 @@ static int ost_thread_init(struct ptlrpc_thread *thread)
          LASSERTF(thread->t_id <= OSS_THREADS_MAX, "%u\n", thread->t_id);
  
          OBD_ALLOC_PTR(tls);
-        if (tls != NULL) {
-                result = 0;
-                thread->t_data = tls;
-                /*
-                 * populate pool
-                 */
-                for (i = 0; i < OST_THREAD_POOL_SIZE; ++ i) {
-                        OBD_PAGE_ALLOC(tls->page[i], OST_THREAD_POOL_GFP);
-                        if (tls->page[i] == NULL) {
-                                ost_thread_done(thread);
-                                result = -ENOMEM;
-                                break;
-                        }
-                }
-        } else
-                result = -ENOMEM;
-        RETURN(result);
+        if (tls == NULL)
+                RETURN(-ENOMEM);
+        thread->t_data = tls;
+        RETURN(0);
  }
  
  /* Sigh - really, this is an OSS, the _server_, not the _target_ */
@@ -1821,30 +1879,31 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
  
          if (oss_num_threads) {
                  /* If oss_num_threads is set, it is the min and the max. */
-                if (oss_num_threads > OSS_THREADS_MAX) 
+                if (oss_num_threads > OSS_THREADS_MAX)
                          oss_num_threads = OSS_THREADS_MAX;
                  if (oss_num_threads < OSS_THREADS_MIN)
                          oss_num_threads = OSS_THREADS_MIN;
                  oss_max_threads = oss_min_threads = oss_num_threads;
          } else {
                  /* Base min threads on memory and cpus */
-                oss_min_threads = num_possible_cpus() * num_physpages >> 
+                oss_min_threads = num_possible_cpus() * num_physpages >>
                          (27 - CFS_PAGE_SHIFT);
                  if (oss_min_threads < OSS_THREADS_MIN)
                          oss_min_threads = OSS_THREADS_MIN;
                  /* Insure a 4x range for dynamic threads */
-                if (oss_min_threads > OSS_THREADS_MAX / 4) 
+                if (oss_min_threads > OSS_THREADS_MAX / 4)
                          oss_min_threads = OSS_THREADS_MAX / 4;
-                oss_max_threads = min(OSS_THREADS_MAX, oss_min_threads * 4);
+                oss_max_threads = min(OSS_THREADS_MAX, oss_min_threads * 4 + 1);
          }
  
          ost->ost_service =
                  ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
                                  OST_MAXREPSIZE, OST_REQUEST_PORTAL,
-                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, 
+                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
                                  ost_handle, LUSTRE_OSS_NAME,
                                  obd->obd_proc_entry, target_print_req,
-                                oss_min_threads, oss_max_threads, "ll_ost");
+                                oss_min_threads, oss_max_threads, "ll_ost",
+                                NULL);
          if (ost->ost_service == NULL) {
                  CERROR("failed to start OST service\n");
                  GOTO(out_lprocfs, rc = -ENOMEM);
@@ -1859,7 +1918,7 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
                          oss_num_create_threads = OSS_MAX_CREATE_THREADS;
                  if (oss_num_create_threads < OSS_DEF_CREATE_THREADS)
                          oss_num_create_threads = OSS_DEF_CREATE_THREADS;
-                oss_min_create_threads = oss_max_create_threads = 
+                oss_min_create_threads = oss_max_create_threads =
                          oss_num_create_threads;
          } else {
                  oss_min_create_threads = OSS_DEF_CREATE_THREADS;
@@ -1874,7 +1933,7 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
                                  obd->obd_proc_entry, target_print_req,
                                  oss_min_create_threads,
                                  oss_max_create_threads,
-                                "ll_ost_creat");
+                                "ll_ost_creat", NULL);
          if (ost->ost_create_service == NULL) {
                  CERROR("failed to start OST create service\n");
                  GOTO(out_service, rc = -ENOMEM);
@@ -1887,10 +1946,11 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
          ost->ost_io_service =
                  ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
                                  OST_MAXREPSIZE, OST_IO_PORTAL,
-                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, 
+                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
                                  ost_handle, "ost_io",
                                  obd->obd_proc_entry, target_print_req,
-                                oss_min_threads, oss_max_threads, "ll_ost_io");
+                                oss_min_threads, oss_max_threads, "ll_ost_io",
+                                ost_hpreq_handler);
          if (ost->ost_io_service == NULL) {
                  CERROR("failed to start OST I/O service\n");
                  GOTO(out_create, rc = -ENOMEM);
@@ -2009,7 +2069,7 @@ static void /*__exit*/ ost_exit(void)
          class_unregister_type(LUSTRE_OSS_NAME);
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Object Storage Target (OST) v0.01");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/ost/ost_internal.h b/lustre/ost/ost_internal.h

index 18630a3..058db78 100644 (file)
--- a/lustre/ost/ost_internal.h
+++ b/lustre/ost/ost_internal.h
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef OST_INTERNAL_H
diff --git a/lustre/ptlrpc/autoMakefile.am b/lustre/ptlrpc/autoMakefile.am

index 66fbf83..d345492 100644 (file)
--- a/lustre/ptlrpc/autoMakefile.am
+++ b/lustre/ptlrpc/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  LDLM_COMM_SOURCES= $(top_srcdir)/lustre/ldlm/l_lock.c  \
         $(top_srcdir)/lustre/ldlm/ldlm_lock.c           \
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c

index 1f2a4cb..0d4c0a9 100644 (file)
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -59,7 +70,7 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
                  return NULL;
          }
  
-        c = ptlrpc_get_connection(peer, self, uuid);
+        c = ptlrpc_connection_get(peer, self, uuid);
          if (c) {
                  memcpy(c->c_remote_uuid.uuid,
                         uuid->uuid, sizeof(c->c_remote_uuid.uuid));
@@ -70,24 +81,6 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
          return c;
  }
  
-void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,
-                                 struct obd_uuid *uuid)
-{
-        lnet_nid_t        self;
-        lnet_process_id_t peer;
-        int               err;
-
-        err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
-        if (err != 0) {
-                CERROR("cannot find peer %s!\n", uuid->uuid);
-                return;
-        }
-
-        conn->c_peer = peer;
-        conn->c_self = self;
-        return;
-}
-
  static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal)
  {
          struct ptlrpc_bulk_desc *desc;
@@ -107,8 +100,8 @@ static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal
          return desc;
  }
  
-struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
-                                               int npages, int type, int portal)
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+                                              int npages, int type, int portal)
  {
          struct obd_import *imp = req->rq_import;
          struct ptlrpc_bulk_desc *desc;
@@ -132,8 +125,8 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
          return desc;
  }
  
-struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req,
-                                               int npages, int type, int portal)
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
+                                              int npages, int type, int portal)
  {
          struct obd_export *exp = req->rq_export;
          struct ptlrpc_bulk_desc *desc;
@@ -200,38 +193,39 @@ void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
  
          if (AT_OFF) {
                  /* non-AT settings */
-                req->rq_timeout = req->rq_import->imp_server_timeout ? 
+                req->rq_timeout = req->rq_import->imp_server_timeout ?
                          obd_timeout / 2 : obd_timeout;
                  lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
                  return;
          }
  
          at = &req->rq_import->imp_at;
-        idx = import_at_get_index(req->rq_import, 
+        idx = import_at_get_index(req->rq_import,
                                    req->rq_request_portal);
          serv_est = at_get(&at->iat_service_estimate[idx]);
-        /* add an arbitrary minimum: 125% +5 sec */
-        req->rq_timeout = serv_est + (serv_est >> 2) + 5;
+        req->rq_timeout = at_est2timeout(serv_est);
          /* We could get even fancier here, using history to predict increased
             loading... */
-             
-        /* Let the server know what this RPC timeout is by putting it in the 
+
+        /* Let the server know what this RPC timeout is by putting it in the
             reqmsg*/
          lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
  }
  
  /* Adjust max service estimate based on server value */
-static void ptlrpc_at_adj_service(struct ptlrpc_request *req) 
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+                                  unsigned int serv_est)
  {
          int idx;
-        unsigned int serv_est, oldse;
-        struct imp_at *at = &req->rq_import->imp_at;
+        unsigned int oldse;
+        struct imp_at *at;
+
+        /* do estimate only if is not in recovery */
+        if (!(req->rq_send_state & (LUSTRE_IMP_FULL | LUSTRE_IMP_CONNECTING)))
+                return;
  
          LASSERT(req->rq_import);
-        
-        /* service estimate is returned in the repmsg timeout field,
-           may be 0 on err */
-        serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+        at = &req->rq_import->imp_at;
  
          idx = import_at_get_index(req->rq_import, req->rq_request_portal);
          /* max service estimates are tracked on the server side,
@@ -239,7 +233,7 @@ static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
          oldse = at_add(&at->iat_service_estimate[idx], serv_est);
          if (oldse != 0)
                  CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
-                       "has changed from %d to %d\n", 
+                       "has changed from %d to %d\n",
                         req->rq_import->imp_obd->obd_name,req->rq_request_portal,
                         oldse, at_get(&at->iat_service_estimate[idx]));
  }
@@ -251,33 +245,33 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
  }
  
  /* Adjust expected network latency */
-static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+                                      unsigned int service_time)
  {
-        unsigned int st, nl, oldnl;
-        struct imp_at *at = &req->rq_import->imp_at;
+        unsigned int nl, oldnl;
+        struct imp_at *at;
          time_t now = cfs_time_current_sec();
  
          LASSERT(req->rq_import);
-
-        st = lustre_msg_get_service_time(req->rq_repmsg);
+        at = &req->rq_import->imp_at;
  
          /* Network latency is total time less server processing time */
-        nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
-        if (st > now - req->rq_sent + 2 /* rounding */)
-                CERROR("Reported service time %u > total measured time %ld\n",
-                       st, now - req->rq_sent);
+        nl = max_t(int, now - req->rq_sent - service_time, 0) + 1/*st rounding*/;
+        if (service_time > now - req->rq_sent + 3 /* bz16408 */)
+                CWARN("Reported service time %u > total measured time %ld\n",
+                      service_time, now - req->rq_sent);
  
          oldnl = at_add(&at->iat_net_latency, nl);
          if (oldnl != 0)
                  CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
-                       "has changed from %d to %d\n", 
+                       "has changed from %d to %d\n",
                         req->rq_import->imp_obd->obd_name,
                         obd_uuid2str(
                                 &req->rq_import->imp_connection->c_remote_uuid),
                         oldnl, at_get(&at->iat_net_latency));
  }
  
-static int unpack_reply(struct ptlrpc_request *req)
+static int unpack_reply_common(struct ptlrpc_request *req)
  {
          int rc;
  
@@ -291,6 +285,17 @@ static int unpack_reply(struct ptlrpc_request *req)
          if (rc > 0)
                  lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
  
+        return rc;
+}
+
+static int unpack_reply(struct ptlrpc_request *req)
+{
+        int rc;
+
+        rc = unpack_reply_common(req);
+        if (rc < 0)
+                return rc;
+
          rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
          if (rc) {
                  DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
@@ -299,71 +304,113 @@ static int unpack_reply(struct ptlrpc_request *req)
          return 0;
  }
  
-/* Handle an early reply message.
-   We can't risk the real reply coming in and changing rq_repmsg, 
-   so this fn must be called under the rq_lock */
-static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
-        struct lustre_msg *oldmsg, *msgcpy;
-        time_t olddl;
-        int oldlen, rc;
+static inline void unpack_reply_free_msg(struct lustre_msg *msg, int len)
+{
+        OBD_FREE(msg, len);
+}
+
+static int unpack_reply_copy_msg(struct ptlrpc_request *req,
+                                 struct lustre_msg **msg, int *len)
+{
+        struct lustre_msg *msgcpy;
+        __u32 csum_calc, csum_get;
+        int lencpy, rc;
          ENTRY;
  
-        req->rq_early = 0;
+        LASSERT_SPIN_LOCKED(&req->rq_lock);
+        *msg = NULL;
+        *len = 0;
  
-        rc = unpack_reply(req);
-        if (rc) 
-                /* Let's just ignore it - same as if it never got here */ 
+        /* Swabbing required when rc == 1 */
+        rc = unpack_reply_common(req);
+        if (rc < 0)
                  RETURN(rc);
  
-        /* We've got to make sure another early reply doesn't land on
-           top of our current repbuf.  Make a copy and verify checksum. */
-        oldlen = req->rq_replen;
+        lencpy = req->rq_replen;
          spin_unlock(&req->rq_lock);
-        OBD_ALLOC(msgcpy, oldlen);
+
+        OBD_ALLOC(msgcpy, lencpy);
          if (!msgcpy) {
                  spin_lock(&req->rq_lock);
                  RETURN(-ENOMEM);
          }
          spin_lock(&req->rq_lock);
-        /* Another reply might have changed the repmsg and replen while 
-           we dropped the lock; doesn't really matter, just use the latest.
-           If it doesn't fit in oldlen, checksum will be wrong. */
-        oldmsg = req->rq_repmsg;
-        memcpy(msgcpy, oldmsg, oldlen);
-        if (lustre_msg_get_cksum(msgcpy) != 
-            lustre_msg_calc_cksum(msgcpy)) {
-                CDEBUG(D_ADAPTTO, "Early reply checksum mismatch, "
-                       "discarding %x != %x\n", lustre_msg_get_cksum(msgcpy),
-                       lustre_msg_calc_cksum(msgcpy));
-                GOTO(out, rc = -EINVAL); 
-        }
-
-        /* Our copied msg is valid, now we can adjust the timeouts without 
-           worrying that a new reply will land on the copy. */
-        req->rq_repmsg = msgcpy;
+
+        /* Checksum must be calculated before being unswabbed.  If the magic
+         * in the copy is unswabbed discard like the checksum failure case */
+        memcpy(msgcpy, req->rq_repmsg, lencpy);
+        if (lustre_msg_need_swab(msgcpy)) {
+                DEBUG_REQ(D_NET, req, "incorrect message magic: %08x\n",
+                          msgcpy->lm_magic);
+                GOTO(err, rc = -EINVAL);
+        }
+
+        csum_calc = lustre_msg_calc_cksum(msgcpy);
+
+        /* Unpack the copy the original rq_repmsg is untouched */
+        rc = lustre_unpack_msg_ptlrpc_body(msgcpy, MSG_PTLRPC_BODY_OFF, rc);
+        if (rc) {
+                DEBUG_REQ(D_ERROR, req, "unpack msg copy failed: %d", rc);
+                GOTO(err, rc = -EPROTO);
+        }
+
+        /* For early replies the LND may update repmsg outside req->rq_lock
+         * resulting in a checksum failure which is non-harmful */
+        csum_get = lustre_msg_get_cksum(msgcpy);
+        if (csum_calc != csum_get) {
+                DEBUG_REQ(D_NET, req, "checksum mismatch: %x != %x\n",
+                          csum_calc, csum_get);
+                GOTO(err, rc = -EINVAL);
+        }
+
+        *msg = msgcpy;
+        *len = lencpy;
+        return 0;
+err:
+        unpack_reply_free_msg(msgcpy, lencpy);
+        return rc;
+}
+
+/* Handle an early reply message.  To prevent a real reply from arriving
+ * and changing req->rq_repmsg this func is called under the rq_lock */
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
+        struct lustre_msg *msg;
+        time_t olddl;
+        int len, rc;
+        ENTRY;
+
+        LASSERT_SPIN_LOCKED(&req->rq_lock);
+        req->rq_early = 0;
+
+        /* All early replys for this request use a single repbuf which can
+         * be updated outside the req->rq_lock.  To prevent racing we create
+         * a copy of the repmsg and verify its checksum before it is used. */
+        rc = unpack_reply_copy_msg(req, &msg, &len);
+        if (rc) {
+                /* Let's just ignore it - same as if it never got here */
+                CDEBUG(D_ADAPTTO, "Discarding racing early reply: %d\n", rc);
+                RETURN(rc);
+        }
  
          /* Expecting to increase the service time estimate here */
-        ptlrpc_at_adj_service(req);
-        ptlrpc_at_adj_net_latency(req);
+        ptlrpc_at_adj_service(req, lustre_msg_get_timeout(msg));
+        ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(msg));
  
          /* Adjust the local timeout for this req */
          ptlrpc_at_set_req_timeout(req);
  
          olddl = req->rq_deadline;
-        /* server assumes it now has rq_timeout from when it sent the 
+        /* Server assumes it now has rq_timeout from when it sent the
             early reply, so client should give it at least that long. */
-        req->rq_deadline = cfs_time_current_sec() + req->rq_timeout + 
+        req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
                      ptlrpc_at_get_net_latency(req);
  
-        DEBUG_REQ(D_ADAPTTO, req, 
-                  "Early reply #%d, new deadline in %lds (%+lds)", 
+        DEBUG_REQ(D_ADAPTTO, req,
+                  "Early reply #%d, new deadline in %lds (%+lds)",
                    req->rq_early_count, req->rq_deadline -
                    cfs_time_current_sec(), req->rq_deadline - olddl);
-        
-        req->rq_repmsg = oldmsg;
-        
-out:
-        OBD_FREE(msgcpy, oldlen);
+
+        unpack_reply_free_msg(msg, len);
          RETURN(rc);
  }
  
@@ -372,16 +419,17 @@ void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
          struct list_head *l, *tmp;
          struct ptlrpc_request *req;
  
-        if (!pool)
-                return;
+        LASSERT(pool != NULL);
  
+        spin_lock(&pool->prp_lock);
          list_for_each_safe(l, tmp, &pool->prp_req_list) {
                  req = list_entry(l, struct ptlrpc_request, rq_list);
                  list_del(&req->rq_list);
-                LASSERT (req->rq_reqmsg);
+                LASSERT(req->rq_reqmsg);
                  OBD_FREE(req->rq_reqmsg, pool->prp_rq_size);
                  OBD_FREE(req, sizeof(*req));
          }
+        spin_unlock(&pool->prp_lock);
          OBD_FREE(pool, sizeof(*pool));
  }
  
@@ -469,7 +517,7 @@ static struct ptlrpc_request *ptlrpc_prep_req_from_pool(struct ptlrpc_request_po
  
          request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
                               rq_list);
-        list_del(&request->rq_list);
+        list_del_init(&request->rq_list);
          spin_unlock(&pool->prp_lock);
  
          LASSERT(request->rq_reqmsg);
@@ -485,7 +533,7 @@ static struct ptlrpc_request *ptlrpc_prep_req_from_pool(struct ptlrpc_request_po
  
  struct ptlrpc_request *
  ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
-                     int count, int *lengths, char **bufs,
+                     int count, __u32 *lengths, char **bufs,
                       struct ptlrpc_request_pool *pool)
  {
          struct ptlrpc_request *request = NULL;
@@ -531,11 +579,13 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
          request->rq_reply_cbid.cbid_fn  = reply_in_callback;
          request->rq_reply_cbid.cbid_arg = request;
  
+        request->rq_reply_deadline = 0;
          request->rq_phase = RQ_PHASE_NEW;
+        request->rq_next_phase = RQ_PHASE_UNDEFINED;
  
          request->rq_request_portal = imp->imp_client->cli_request_portal;
          request->rq_reply_portal = imp->imp_client->cli_reply_portal;
-        
+
          ptlrpc_at_set_req_timeout(request);
  
          spin_lock_init(&request->rq_lock);
@@ -543,6 +593,7 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
          CFS_INIT_LIST_HEAD(&request->rq_replay_list);
          CFS_INIT_LIST_HEAD(&request->rq_set_chain);
          CFS_INIT_LIST_HEAD(&request->rq_history_list);
+        CFS_INIT_LIST_HEAD(&request->rq_exp_list);
          cfs_waitq_init(&request->rq_reply_waitq);
          request->rq_xid = ptlrpc_next_xid();
          atomic_set(&request->rq_refcount, 1);
@@ -554,7 +605,7 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
  
  struct ptlrpc_request *
  ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
-                int *lengths, char **bufs)
+                __u32 *lengths, char **bufs)
  {
          return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
                                      NULL);
@@ -638,7 +689,7 @@ int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
  {
          struct ptlrpc_set_cbdata *cbdata;
  
-        OBD_SLAB_ALLOC(cbdata, ptlrpc_cbdata_slab, 
+        OBD_SLAB_ALLOC(cbdata, ptlrpc_cbdata_slab,
                          CFS_ALLOC_STD, sizeof(*cbdata));
          if (cbdata == NULL)
                  RETURN(-ENOMEM);
@@ -646,7 +697,7 @@ int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
          cbdata->psc_interpret = fn;
          cbdata->psc_data = data;
          list_add_tail(&cbdata->psc_item, &set->set_cblist);
-        
+
          RETURN(0);
  }
  
@@ -657,20 +708,38 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
          list_add_tail(&req->rq_set_chain, &set->set_requests);
          req->rq_set = set;
          set->set_remaining++;
-
-        atomic_inc(&req->rq_import->imp_inflight);
  }
  
-/* lock so many callers can add things, the context that owns the set
- * is supposed to notice these and move them into the set proper. */
-void ptlrpc_set_add_new_req(struct ptlrpc_request_set *set,
-                            struct ptlrpc_request *req)
+/** 
+ * Lock so many callers can add things, the context that owns the set
+ * is supposed to notice these and move them into the set proper. 
+ */
+int ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+                           struct ptlrpc_request *req)
  {
+        struct ptlrpc_request_set *set = pc->pc_set;
+
+        /* 
+         * Let caller know that we stopped and will not handle this request.
+         * It needs to take care itself of request.
+         */
+        if (test_bit(LIOD_STOP, &pc->pc_flags))
+                return -EALREADY;
+
          spin_lock(&set->set_new_req_lock);
-        /* The set takes over the caller's request reference */
+        /* 
+         * The set takes over the caller's request reference. 
+         */
          list_add_tail(&req->rq_set_chain, &set->set_new_requests);
          req->rq_set = set;
          spin_unlock(&set->set_new_req_lock);
+
+        /*
+         * Let thead know that we added something and better it to wake up 
+         * and process.
+         */
+        cfs_waitq_signal(&set->set_waitq);
+        return 0;
  }
  
  /*
@@ -737,31 +806,42 @@ static int ptlrpc_import_delay_req(struct obd_import *imp,
  static int ptlrpc_check_reply(struct ptlrpc_request *req)
  {
          int rc = 0;
+        const char *what = "";
          ENTRY;
  
          /* serialise with network callback */
          spin_lock(&req->rq_lock);
  
-        if (req->rq_replied)
+        if (ptlrpc_client_replied(req)) {
+                what = "REPLIED: ";
                  GOTO(out, rc = 1);
+        }
  
          if (req->rq_net_err && !req->rq_timedout) {
+                what = "NETERR: ";
                  spin_unlock(&req->rq_lock);
-                rc = ptlrpc_expire_one_request(req);
+                rc = ptlrpc_expire_one_request(req, 0);
                  spin_lock(&req->rq_lock);
                  GOTO(out, rc);
          }
  
-        if (req->rq_err)
+        if (req->rq_err) {
+                what = "ABORTED: ";
                  GOTO(out, rc = 1);
+        }
  
-        if (req->rq_resend)
+        if (req->rq_resend) {
+                what = "RESEND: ";
                  GOTO(out, rc = 1);
+        }
  
-        if (req->rq_restart)
+        if (req->rq_restart) {
+                what = "RESTART: ";
                  GOTO(out, rc = 1);
+        }
  
-        if (req->rq_early) {
+        if (ptlrpc_client_early(req)) {
+                what = "EARLYREP: ";
                  ptlrpc_at_recv_early_reply(req);
                  GOTO(out, rc = 0); /* keep waiting */
          }
@@ -769,7 +849,7 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
          EXIT;
   out:
          spin_unlock(&req->rq_lock);
-        DEBUG_REQ(D_NET, req, "rc = %d for", rc);
+        DEBUG_REQ(D_NET, req, "%src = %d for", what, rc);
          return rc;
  }
  
@@ -800,6 +880,27 @@ static int ptlrpc_check_status(struct ptlrpc_request *req)
          RETURN(err);
  }
  
+/* VBR: we should save pre-versions for replay*/
+static void ptlrpc_save_versions(struct ptlrpc_request *req)
+{
+        struct lustre_msg *repmsg = req->rq_repmsg;
+        struct lustre_msg *reqmsg = req->rq_reqmsg;
+        __u64 *versions = lustre_msg_get_versions(repmsg);
+        ENTRY;
+        /* Interoperability with 1.6. This should be changed to LASSERT in HEAD */
+        if (versions == NULL)
+                return;
+
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
+                return;
+
+        lustre_msg_set_versions(reqmsg, versions);
+        CDEBUG(D_INFO, "Client save versions ["LPX64"/"LPX64"]\n",
+               versions[0], versions[1]);
+
+        EXIT;
+}
+
  static int after_reply(struct ptlrpc_request *req)
  {
          struct obd_import *imp = req->rq_import;
@@ -817,7 +918,7 @@ static int after_reply(struct ptlrpc_request *req)
  
          LASSERT (req->rq_nob_received <= req->rq_replen);
          rc = unpack_reply(req);
-        if (rc) 
+        if (rc)
                  RETURN(rc);
  
          do_gettimeofday(&work_start);
@@ -827,8 +928,8 @@ static int after_reply(struct ptlrpc_request *req)
                                      timediff);
  
          OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
-        ptlrpc_at_adj_service(req);
-        ptlrpc_at_adj_net_latency(req);
+        ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+        ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(req->rq_repmsg));
  
          if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
              lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
@@ -851,25 +952,29 @@ static int after_reply(struct ptlrpc_request *req)
                          RETURN(rc);
                  }
          } else {
-                /* Let's look if server sent slv. Do it only for RPC with 
+                /* Let's look if server sent slv. Do it only for RPC with
                   * rc == 0. */
                  ldlm_cli_update_pool(req);
          }
  
          /* Store transno in reqmsg for replay. */
-        req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
-        lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
+        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+                req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
+                lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
+        }
  
-        if (req->rq_import->imp_replayable) {
+        if (imp->imp_replayable) {
                  spin_lock(&imp->imp_lock);
                  /* no point in adding already-committed requests to the replay
                   * list, we will just remove them immediately. b=9829 */
-                if (req->rq_transno != 0 && 
-                    (req->rq_transno > 
+                if (req->rq_transno != 0 &&
+                    (req->rq_transno >
                       lustre_msg_get_last_committed(req->rq_repmsg) ||
-                     req->rq_replay))
+                     req->rq_replay)) {
+                        /* version recovery */
+                        ptlrpc_save_versions(req);
                          ptlrpc_retain_replayable_request(req, imp);
-                else if (req->rq_commit_cb != NULL) {
+                } else if (req->rq_commit_cb != NULL) {
                          spin_unlock(&imp->imp_lock);
                          req->rq_commit_cb(req);
                          spin_lock(&imp->imp_lock);
@@ -895,8 +1000,8 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
          LASSERT(req->rq_phase == RQ_PHASE_NEW);
          if (req->rq_sent && (req->rq_sent > CURRENT_SECONDS))
                  RETURN (0);
-        
-        req->rq_phase = RQ_PHASE_RPC;
+
+        ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
  
          imp = req->rq_import;
          spin_lock(&imp->imp_lock);
@@ -904,18 +1009,17 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
          req->rq_import_generation = imp->imp_generation;
  
          if (ptlrpc_import_delay_req(imp, req, &rc)) {
-                spin_lock (&req->rq_lock);
+                spin_lock(&req->rq_lock);
                  req->rq_waiting = 1;
-                spin_unlock (&req->rq_lock);
+                spin_unlock(&req->rq_lock);
  
                  DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: "
-                          "(%s != %s)",
-                          lustre_msg_get_status(req->rq_reqmsg) ,
+                          "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg),
                            ptlrpc_import_state_name(req->rq_send_state),
                            ptlrpc_import_state_name(imp->imp_state));
-                LASSERT(list_empty (&req->rq_list));
-
+                LASSERT(list_empty(&req->rq_list));
                  list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+                atomic_inc(&req->rq_import->imp_inflight);
                  spin_unlock(&imp->imp_lock);
                  RETURN(0);
          }
@@ -923,13 +1027,13 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
          if (rc != 0) {
                  spin_unlock(&imp->imp_lock);
                  req->rq_status = rc;
-                req->rq_phase = RQ_PHASE_INTERPRET;
+                ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                  RETURN(rc);
          }
  
-        /* XXX this is the same as ptlrpc_queue_wait */
          LASSERT(list_empty(&req->rq_list));
          list_add_tail(&req->rq_list, &imp->imp_sending_list);
+        atomic_inc(&req->rq_import->imp_inflight);
          spin_unlock(&imp->imp_lock);
  
          lustre_msg_set_status(req->rq_reqmsg, cfs_curproc_pid());
@@ -969,6 +1073,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                      ptlrpc_send_new_req(req)) {
                          force_timer_recalc = 1;
                  }
+
                  /* delayed send - skip */
                  if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
                          continue;
@@ -976,30 +1081,63 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                  if (!(req->rq_phase == RQ_PHASE_RPC ||
                        req->rq_phase == RQ_PHASE_BULK ||
                        req->rq_phase == RQ_PHASE_INTERPRET ||
+                      req->rq_phase == RQ_PHASE_UNREGISTERING ||
                        req->rq_phase == RQ_PHASE_COMPLETE)) {
                          DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
                          LBUG();
                  }
  
+                if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+                        LASSERT(req->rq_next_phase != req->rq_phase);
+                        LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+
+                        /* Skip processing until reply is unlinked. We
+                         * can't return to pool before that and we can't
+                         * call interpret before that. We need to make
+                         * sure that all rdma transfers finished and will
+                         * not corrupt any data. */
+                        if (ptlrpc_client_recv_or_unlink(req) ||
+                            ptlrpc_client_bulk_active(req))
+                                continue;
+
+                        /* Turn repl fail_loc off to prevent it from looping
+                         * forever. */
+                        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+                                OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK |
+                                               OBD_FAIL_ONCE);
+                        }
+
+                        /* Turn off bulk fail_loc. */
+                        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
+                                OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK |
+                                               OBD_FAIL_ONCE);
+                        }
+
+                        /* Move to next phase if reply was successfully 
+                         * unlinked. */
+                        ptlrpc_rqphase_move(req, req->rq_next_phase);
+                }
+
                  if (req->rq_phase == RQ_PHASE_COMPLETE)
                          continue;
  
                  if (req->rq_phase == RQ_PHASE_INTERPRET)
                          GOTO(interpret, req->rq_status);
  
-                if (req->rq_net_err && !req->rq_timedout)
-                        ptlrpc_expire_one_request(req);
+                /* Note that this also will start async reply unlink */
+                if (req->rq_net_err && !req->rq_timedout) {
+                        ptlrpc_expire_one_request(req, 1);
+
+                        /* Check if we still need to wait for unlink. */
+                        if (ptlrpc_client_recv_or_unlink(req) ||
+                            ptlrpc_client_bulk_active(req))
+                                continue;
+                }
  
                  if (req->rq_err) {
-                        ptlrpc_unregister_reply(req);
                          if (req->rq_status == 0)
                                  req->rq_status = -EIO;
-                        req->rq_phase = RQ_PHASE_INTERPRET;
-
-                        spin_lock(&imp->imp_lock);
-                        list_del_init(&req->rq_list);
-                        spin_unlock(&imp->imp_lock);
-
+                        ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                          GOTO(interpret, req->rq_status);
                  }
  
@@ -1009,15 +1147,8 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                   * seen a timeout.  our policy is to only interpret
                   * interrupted rpcs after they have timed out */
                  if (req->rq_intr && (req->rq_timedout || req->rq_waiting)) {
-                        /* NB could be on delayed list */
-                        ptlrpc_unregister_reply(req);
                          req->rq_status = -EINTR;
-                        req->rq_phase = RQ_PHASE_INTERPRET;
-
-                        spin_lock(&imp->imp_lock);
-                        list_del_init(&req->rq_list);
-                        spin_unlock(&imp->imp_lock);
-
+                        ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                          GOTO(interpret, req->rq_status);
                  }
  
@@ -1025,43 +1156,54 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                          if (req->rq_timedout||req->rq_waiting||req->rq_resend) {
                                  int status;
  
-                                ptlrpc_unregister_reply(req);
+                                if (!ptlrpc_unregister_reply(req, 1))
+                                        continue;
  
                                  spin_lock(&imp->imp_lock);
-
                                  if (ptlrpc_import_delay_req(imp, req, &status)){
+                                        /* put on delay list - only if we wait
+                                         * recovery finished - before send */
+                                        list_del_init(&req->rq_list);
+                                        list_add_tail(&req->rq_list, &imp->imp_delayed_list);
                                          spin_unlock(&imp->imp_lock);
                                          continue;
                                  }
  
-                                list_del_init(&req->rq_list);
                                  if (status != 0)  {
                                          req->rq_status = status;
-                                        req->rq_phase = RQ_PHASE_INTERPRET;
+                                        ptlrpc_rqphase_move(req, 
+                                                RQ_PHASE_INTERPRET);
                                          spin_unlock(&imp->imp_lock);
                                          GOTO(interpret, req->rq_status);
                                  }
                                  if (req->rq_no_resend) {
                                          req->rq_status = -ENOTCONN;
-                                        req->rq_phase = RQ_PHASE_INTERPRET;
+                                        ptlrpc_rqphase_move(req, 
+                                                RQ_PHASE_INTERPRET);
                                          spin_unlock(&imp->imp_lock);
                                          GOTO(interpret, req->rq_status);
                                  }
+
+                                list_del_init(&req->rq_list);
                                  list_add_tail(&req->rq_list,
                                                &imp->imp_sending_list);
  
                                  spin_unlock(&imp->imp_lock);
  
                                  req->rq_waiting = 0;
-                                if (req->rq_resend) {
-                                        lustre_msg_add_flags(req->rq_reqmsg,
-                                                             MSG_RESENT);
+
+                                if (req->rq_timedout||req->rq_resend) {
+                                        /* This is re-sending anyways, 
+                                         * let's mark req as resend. */
+                                        req->rq_resend = 1;
                                          if (req->rq_bulk) {
-                                                __u64 old_xid = req->rq_xid;
+                                                __u64 old_xid;
  
-                                                ptlrpc_unregister_bulk (req);
+                                                if (!ptlrpc_unregister_bulk(req, 1))
+                                                        continue;
  
                                                  /* ensure previous bulk fails */
+                                                old_xid = req->rq_xid;
                                                  req->rq_xid = ptlrpc_next_xid();
                                                  CDEBUG(D_HA, "resend bulk "
                                                         "old x"LPU64
@@ -1083,57 +1225,45 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
  
                          spin_lock(&req->rq_lock);
  
-                        if (req->rq_early) {
+                        if (ptlrpc_client_early(req)) {
                                  ptlrpc_at_recv_early_reply(req);
                                  spin_unlock(&req->rq_lock);
                                  continue;
                          }
  
                          /* Still waiting for a reply? */
-                        if (req->rq_receiving_reply) {
+                        if (ptlrpc_client_recv(req)) {
                                  spin_unlock(&req->rq_lock);
                                  continue;
                          }
  
                          /* Did we actually receive a reply? */
-                        if (!req->rq_replied) {
+                        if (!ptlrpc_client_replied(req)) {
                                  spin_unlock(&req->rq_lock);
                                  continue;
                          }
  
                          spin_unlock(&req->rq_lock);
  
-                        spin_lock(&imp->imp_lock);
-                        list_del_init(&req->rq_list);
-                        spin_unlock(&imp->imp_lock);
-
                          req->rq_status = after_reply(req);
-                        if (req->rq_resend) {
-                                /* Add this req to the delayed list so
-                                   it can be errored if the import is
-                                   evicted after recovery. */
-                                spin_lock(&imp->imp_lock);
-                                list_add_tail(&req->rq_list,
-                                              &imp->imp_delayed_list);
-                                spin_unlock(&imp->imp_lock);
+                        if (req->rq_resend)
                                  continue;
-                        }
  
                          /* If there is no bulk associated with this request,
                           * then we're done and should let the interpreter
-                         * process the reply.  Similarly if the RPC returned
+                         * process the reply. Similarly if the RPC returned
                           * an error, and therefore the bulk will never arrive.
                           */
                          if (req->rq_bulk == NULL || req->rq_status != 0) {
-                                req->rq_phase = RQ_PHASE_INTERPRET;
+                                ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                                  GOTO(interpret, req->rq_status);
                          }
  
-                        req->rq_phase = RQ_PHASE_BULK;
+                        ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
                  }
  
                  LASSERT(req->rq_phase == RQ_PHASE_BULK);
-                if (ptlrpc_bulk_active(req->rq_bulk))
+                if (ptlrpc_client_bulk_active(req))
                          continue;
  
                  if (!req->rq_bulk->bd_success) {
@@ -1143,19 +1273,26 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                           * the ACK for her PUT. */
                          DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
                          req->rq_status = -EIO;
-                        req->rq_phase = RQ_PHASE_INTERPRET;
+                        ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                          GOTO(interpret, req->rq_status);
                  }
  
-                req->rq_phase = RQ_PHASE_INTERPRET;
+                ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
  
          interpret:
                  LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
-                LASSERT(!req->rq_receiving_reply);
  
-                ptlrpc_unregister_reply(req);
-                if (req->rq_bulk != NULL)
-                        ptlrpc_unregister_bulk (req);
+                /* This moves to "unregistering" phase we need to wait for
+                 * reply unlink. */
+                if (!ptlrpc_unregister_reply(req, 1))
+                        continue;
+
+                if (!ptlrpc_unregister_bulk(req, 1))
+                        continue;
+
+                /* When calling interpret receiving already should be
+                 * finished. */
+                LASSERT(!req->rq_receiving_reply);
  
                  if (req->rq_interpret_reply != NULL) {
                          int (*interpreter)(struct ptlrpc_request *,void *,int) =
@@ -1163,7 +1300,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                          req->rq_status = interpreter(req, &req->rq_async_args,
                                                       req->rq_status);
                  }
-                req->rq_phase = RQ_PHASE_COMPLETE;
+                ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
  
                  CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:"
                         "opc %s:%s:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(),
@@ -1172,9 +1309,18 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                         libcfs_nid2str(imp->imp_connection->c_peer.nid),
                         lustre_msg_get_opc(req->rq_reqmsg));
  
-                set->set_remaining--;
+                spin_lock(&imp->imp_lock);
+                /* Request already may be not on sending or delaying list. This
+                 * may happen in the case of marking it errorneous for the case
+                 * ptlrpc_import_delay_req(req, status) find it impossible to 
+                 * allow sending this rpc and returns *status != 0. */
+                if (!list_empty(&req->rq_list)) {
+                        list_del_init(&req->rq_list);
+                        atomic_dec(&imp->imp_inflight);
+                }
+                spin_unlock(&imp->imp_lock);
  
-                atomic_dec(&imp->imp_inflight);
+                set->set_remaining--;
                  cfs_waitq_signal(&imp->imp_recovery_waitq);
          }
  
@@ -1183,7 +1329,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
  }
  
  /* Return 1 if we should give up, else 0 */
-int ptlrpc_expire_one_request(struct ptlrpc_request *req)
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
  {
          struct obd_import *imp = req->rq_import;
          int rc = 0;
@@ -1209,14 +1355,12 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req)
          req->rq_timedout = 1;
          spin_unlock(&req->rq_lock);
  
-        ptlrpc_unregister_reply (req);
+        ptlrpc_unregister_reply(req, async_unlink);
+        ptlrpc_unregister_bulk(req, async_unlink);
  
          if (obd_dump_on_timeout)
                  libcfs_debug_dumplog();
  
-        if (req->rq_bulk != NULL)
-                ptlrpc_unregister_bulk (req);
-
          if (imp == NULL) {
                  DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
                  RETURN(1);
@@ -1240,7 +1384,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req)
                  RETURN(1);
          }
  
-        /* if a request can't be resent we can't wait for an answer after 
+        /* if a request can't be resent we can't wait for an answer after
             the timeout */
          if (req->rq_no_resend) {
                  DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
@@ -1262,28 +1406,28 @@ int ptlrpc_expired_set(void *data)
          LASSERT(set != NULL);
  
          /* A timeout expired; see which reqs it applies to... */
-        list_for_each (tmp, &set->set_requests) {
+        list_for_each(tmp, &set->set_requests) {
                  struct ptlrpc_request *req =
                          list_entry(tmp, struct ptlrpc_request, rq_set_chain);
  
-                /* request in-flight? */
-                if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting &&
-                       !req->rq_resend) ||
+                /* Request in-flight? */
+                if (!((req->rq_phase == RQ_PHASE_RPC &&
+                       !req->rq_waiting && !req->rq_resend) ||
                        (req->rq_phase == RQ_PHASE_BULK)))
                          continue;
  
-                if (req->rq_timedout ||           /* already dealt with */
+                if (req->rq_timedout ||     /* already dealt with */
                      req->rq_deadline > now) /* not expired */
                          continue;
  
-                /* deal with this guy */
-                ptlrpc_expire_one_request (req);
+                /* Deal with this guy. Do it asynchronously to not block
+                 * ptlrpcd thread. */
+                ptlrpc_expire_one_request(req, 1);
          }
  
          /* When waiting for a whole set, we always to break out of the
           * sleep so we can recalculate the timeout, or enable interrupts
-         * iff everyone's timed out.
-         */
+         * if everyone's timed out. */
          RETURN(1);
  }
  
@@ -1306,7 +1450,8 @@ void ptlrpc_interrupted_set(void *data)
                  struct ptlrpc_request *req =
                          list_entry(tmp, struct ptlrpc_request, rq_set_chain);
  
-                if (req->rq_phase != RQ_PHASE_RPC)
+                if (req->rq_phase != RQ_PHASE_RPC &&
+                    req->rq_phase != RQ_PHASE_UNREGISTERING)
                          continue;
  
                  ptlrpc_mark_interrupted(req);
@@ -1330,25 +1475,25 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
  
                  /* request in-flight? */
                  if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
-                      (req->rq_phase == RQ_PHASE_BULK) || 
+                      (req->rq_phase == RQ_PHASE_BULK) ||
                        (req->rq_phase == RQ_PHASE_NEW)))
                          continue;
  
-                if (req->rq_timedout)   /* already timed out */
+                /* Already timed out. */
+                if (req->rq_timedout)
                          continue;
  
                  if (req->rq_phase == RQ_PHASE_NEW)
-                        deadline = req->rq_sent;        /* delayed send */
+                        deadline = req->rq_sent;    /* delayed send */
                  else
                          deadline = req->rq_deadline;
  
                  if (deadline <= now) {  /* actually expired already */
                          timeout = 1;    /* ASAP */
                          break;
-                } 
-                if ((timeout == 0) || (timeout > (deadline - now))) {
-                        timeout = deadline - now;
                  }
+                if ((timeout == 0) || (timeout > (deadline - now)))
+                        timeout = deadline - now;
          }
          RETURN(timeout);
  }
@@ -1412,13 +1557,13 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set)
                  struct ptlrpc_set_cbdata *cbdata, *n;
                  int err;
  
-                list_for_each_entry_safe(cbdata, n, 
+                list_for_each_entry_safe(cbdata, n,
                                           &set->set_cblist, psc_item) {
                          list_del_init(&cbdata->psc_item);
                          err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
                          if (err && !rc)
                                  rc = err;
-                        OBD_SLAB_FREE(cbdata, ptlrpc_cbdata_slab, 
+                        OBD_SLAB_FREE(cbdata, ptlrpc_cbdata_slab,
                                          sizeof(*cbdata));
                  }
          }
@@ -1431,6 +1576,8 @@ static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
          struct ptlrpc_request_pool *pool = request->rq_pool;
  
          spin_lock(&pool->prp_lock);
+        LASSERT(list_empty(&request->rq_list));
+        LASSERT(!request->rq_receiving_reply);
          list_add_tail(&request->rq_list, &pool->prp_req_list);
          spin_unlock(&pool->prp_lock);
  }
@@ -1447,6 +1594,8 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
          LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */
          LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
          LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+        LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
+        LASSERTF(!request->rq_replay, "req %p\n", request);
  
          /* We must take it off the imp_replay_list first.  Otherwise, we'll set
           * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
@@ -1493,11 +1642,6 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
          EXIT;
  }
  
-void ptlrpc_free_req(struct ptlrpc_request *request)
-{
-        __ptlrpc_free_req(request, 0);
-}
-
  static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
  void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
  {
@@ -1545,24 +1689,41 @@ EXPORT_SYMBOL(ptlrpc_req_xid);
   * IDEMPOTENT, but _not_ safe against concurrent callers.
   * The request owner (i.e. the thread doing the I/O) must call...
   */
-void ptlrpc_unregister_reply (struct ptlrpc_request *request)
+int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
  {
          int                rc;
          cfs_waitq_t       *wq;
          struct l_wait_info lwi;
          ENTRY;
  
-        LASSERT(!in_interrupt ());             /* might sleep */
+        /* Might sleep. */
+        LASSERT(!in_interrupt());
+
+        /* Let's setup deadline for reply unlink. */
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && 
+            async && request->rq_reply_deadline == 0)
+                request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK;
+
+        /* Nothing left to do. */
          if (!ptlrpc_client_recv_or_unlink(request))
-                /* Nothing left to do */
-                return;
+                RETURN(1);
  
-        LNetMDUnlink (request->rq_reply_md_h);
+        LNetMDUnlink(request->rq_reply_md_h);
+
+        /* Let's check it once again. */        
+        if (!ptlrpc_client_recv_or_unlink(request))
+                RETURN(1);
+
+        /* Move to "Unregistering" phase as reply was not unlinked yet. */
+        ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
+
+        /* Do not wait for unlink to finish. */
+        if (async)
+                RETURN(0);
  
          /* We have to l_wait_event() whatever the result, to give liblustre
-         * a chance to run reply_in_callback(), and to make sure we've 
+         * a chance to run reply_in_callback(), and to make sure we've
           * unlinked before returning a req to the pool */
-
          if (request->rq_set != NULL)
                  wq = &request->rq_set->set_waitq;
          else
@@ -1571,18 +1732,21 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request)
          for (;;) {
                  /* Network access will complete in finite time but the HUGE
                   * timeout lets us CWARN for visibility of sluggish NALs */
-                lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL);
-                rc = l_wait_event (*wq, !ptlrpc_client_recv_or_unlink(request),
-                                   &lwi);
-                if (rc == 0)
-                        return;
+                lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                           cfs_time_seconds(1), NULL, NULL);
+                rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
+                                  &lwi);
+                if (rc == 0) {
+                        ptlrpc_rqphase_move(request, request->rq_next_phase);
+                        RETURN(1);
+                }
  
-                LASSERT (rc == -ETIMEDOUT);
+                LASSERT(rc == -ETIMEDOUT);
                  DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
                            "rvcng=%d unlnk=%d", request->rq_receiving_reply,
                            request->rq_must_unlink);
          }
-        EXIT;
+        RETURN(0);
  }
  
  /* caller must hold imp->imp_lock */
@@ -1676,7 +1840,7 @@ void ptlrpc_resend_req(struct ptlrpc_request *req)
                  CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
                         old_xid, req->rq_xid);
          }
-        ptlrpc_wake_client_req(req);
+        ptlrpc_client_wake_req(req);
          spin_unlock(&req->rq_lock);
  }
  
@@ -1689,7 +1853,7 @@ void ptlrpc_restart_req(struct ptlrpc_request *req)
          spin_lock(&req->rq_lock);
          req->rq_restart = 1;
          req->rq_timedout = 0;
-        ptlrpc_wake_client_req(req);
+        ptlrpc_client_wake_req(req);
          spin_unlock(&req->rq_lock);
  }
  
@@ -1767,7 +1931,6 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
  
          LASSERT(req->rq_set == NULL);
          LASSERT(!req->rq_receiving_reply);
-        atomic_inc(&imp->imp_inflight);
  
          /* for distributed debugging */
          lustre_msg_set_status(req->rq_reqmsg, cfs_curproc_pid());
@@ -1780,15 +1943,15 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 lustre_msg_get_opc(req->rq_reqmsg));
  
          /* Mark phase here for a little debug help */
-        req->rq_phase = RQ_PHASE_RPC;
+        ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
  
          spin_lock(&imp->imp_lock);
          req->rq_import_generation = imp->imp_generation;
  restart:
          if (ptlrpc_import_delay_req(imp, req, &rc)) {
-                list_del(&req->rq_list);
-
+                list_del_init(&req->rq_list);
                  list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+                atomic_inc(&imp->imp_inflight);
                  spin_unlock(&imp->imp_lock);
  
                  DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%s != %s)",
@@ -1808,6 +1971,7 @@ restart:
  
                  spin_lock(&imp->imp_lock);
                  list_del_init(&req->rq_list);
+                atomic_dec(&imp->imp_inflight);
  
                  if (req->rq_err) {
                          /* rq_status was set locally */
@@ -1826,17 +1990,14 @@ restart:
          }
  
          if (rc != 0) {
-                list_del_init(&req->rq_list);
                  spin_unlock(&imp->imp_lock);
                  req->rq_status = rc; // XXX this ok?
                  GOTO(out, rc);
          }
  
          if (req->rq_resend) {
-                lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
-
                  if (req->rq_bulk != NULL) {
-                        ptlrpc_unregister_bulk (req);
+                        ptlrpc_unregister_bulk(req, 0);
  
                          /* bulk requests are supposed to be
                           * idempotent, so we are free to bump the xid
@@ -1854,29 +2015,30 @@ restart:
          /* XXX this is the same as ptlrpc_set_wait */
          LASSERT(list_empty(&req->rq_list));
          list_add_tail(&req->rq_list, &imp->imp_sending_list);
+        atomic_inc(&imp->imp_inflight);
          spin_unlock(&imp->imp_lock);
  
          rc = ptl_send_rpc(req, 0);
-        if (rc) 
+        if (rc)
                  DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc);
          do {
                  timeoutl = req->rq_deadline - cfs_time_current_sec();
                  timeout = (timeoutl <= 0 || rc) ? CFS_TICK :
                          cfs_time_seconds(timeoutl);
-                DEBUG_REQ(D_NET, req, 
+                DEBUG_REQ(D_NET, req,
                            "-- sleeping for "CFS_DURATION_T" ticks", timeout);
                  lwi = LWI_TIMEOUT_INTR(timeout, NULL, interrupted_request, req);
                  brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req),
                                    &lwi);
                  /* Wait again if we changed deadline */
-        } while ((brc == -ETIMEDOUT) && 
+        } while ((brc == -ETIMEDOUT) &&
                   (req->rq_deadline > cfs_time_current_sec()));
  
-        if ((brc == -ETIMEDOUT) && !ptlrpc_expire_one_request(req)) {
+        if ((brc == -ETIMEDOUT) && !ptlrpc_expire_one_request(req, 0)) {
                  /* Wait forever for reconnect / replay or failure */
                  lwi = LWI_INTR(interrupted_request, req);
-                rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req),
-                                  &lwi);
+                brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req),
+                                   &lwi);
          }
  
          CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:opc "
@@ -1886,20 +2048,20 @@ restart:
                 libcfs_nid2str(imp->imp_connection->c_peer.nid),
                 lustre_msg_get_opc(req->rq_reqmsg));
  
-        spin_lock(&imp->imp_lock);
-        list_del_init(&req->rq_list);
-        spin_unlock(&imp->imp_lock);
-
          /* If the reply was received normally, this just grabs the spinlock
           * (ensuring the reply callback has returned), sees that
           * req->rq_receiving_reply is clear and returns. */
-        ptlrpc_unregister_reply (req);
+        ptlrpc_unregister_reply(req, 0);
  
+        spin_lock(&imp->imp_lock);
+        list_del_init(&req->rq_list);
+        atomic_dec(&imp->imp_inflight);
+        spin_unlock(&imp->imp_lock);
  
          if (req->rq_err) {
-                DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d", 
+                DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d",
                            rc, req->rq_status);
-                GOTO(out, rc = -EIO);
+                GOTO(out, rc = rc ? rc : -EIO);
          }
  
          if (req->rq_intr) {
@@ -1911,19 +2073,19 @@ restart:
          }
  
          /* Resend if we need to */
-        if (req->rq_resend) {
+        if (req->rq_resend||req->rq_timedout) {
                  /* ...unless we were specifically told otherwise. */
                  if (req->rq_no_resend)
                          GOTO(out, rc = -ETIMEDOUT);
                  spin_lock(&imp->imp_lock);
+                /* we can have rq_timeout on dlm fake import which not support
+                 * recovery - but me need resend request on this import instead
+                 * of return error */
+                req->rq_resend = 1;
                  goto restart;
          }
  
-        if (req->rq_timedout) {                 /* non-recoverable timeout */
-                GOTO(out, rc = -ETIMEDOUT);
-        }
-
-        if (!req->rq_replied) {
+        if (!ptlrpc_client_replied(req)) {
                  /* How can this be? -eeb */
                  DEBUG_REQ(D_ERROR, req, "!rq_replied: ");
                  LBUG();
@@ -1947,7 +2109,7 @@ restart:
                           * me. */
                          lwi = LWI_TIMEOUT(timeout, NULL, NULL);
                          brc = l_wait_event(req->rq_reply_waitq,
-                                           !ptlrpc_bulk_active(req->rq_bulk),
+                                           !ptlrpc_client_bulk_active(req),
                                             &lwi);
                          LASSERT(brc == 0 || brc == -ETIMEDOUT);
                          if (brc != 0) {
@@ -1960,13 +2122,11 @@ restart:
                          }
                  }
                  if (rc < 0)
-                        ptlrpc_unregister_bulk (req);
+                        ptlrpc_unregister_bulk(req, 0);
          }
  
          LASSERT(!req->rq_receiving_reply);
-        req->rq_phase = RQ_PHASE_INTERPRET;
-
-        atomic_dec(&imp->imp_inflight);
+        ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
          cfs_waitq_signal(&imp->imp_recovery_waitq);
          RETURN(rc);
  }
@@ -1985,7 +2145,7 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
          ENTRY;
          atomic_dec(&imp->imp_replay_inflight);
  
-        if (!req->rq_replied) {
+        if (!ptlrpc_client_replied(req)) {
                  CERROR("request replay timed out, restarting recovery\n");
                  GOTO(out, rc = -ETIMEDOUT);
          }
@@ -1995,9 +2155,28 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
               lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
                  GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
  
-        /* The transno had better not change over replay. */
-        LASSERT(lustre_msg_get_transno(req->rq_reqmsg) ==
-                lustre_msg_get_transno(req->rq_repmsg));
+        /* VBR: check version failure */
+        if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
+                /* replay was failed due to version mismatch */
+                DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
+                spin_lock(&imp->imp_lock);
+                imp->imp_vbr_failed = 1;
+                imp->imp_no_lock_replay = 1;
+                spin_unlock(&imp->imp_lock);
+        } else {
+                /* The transno had better not change over replay. */
+                LASSERT(lustre_msg_get_transno(req->rq_reqmsg) ==
+                        lustre_msg_get_transno(req->rq_repmsg) ||
+                        lustre_msg_get_transno(req->rq_repmsg) == 0);
+        }
+
+        spin_lock(&imp->imp_lock);
+        /* if replays by version then gap was occur on server, no trust to locks */
+        if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
+                imp->imp_no_lock_replay = 1;
+        imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
+        spin_unlock(&imp->imp_lock);
+        LASSERT(imp->imp_last_replay_transno);
  
          DEBUG_REQ(D_HA, req, "got rep");
  
@@ -2005,7 +2184,7 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
          if (req->rq_replay_cb)
                  req->rq_replay_cb(req);
  
-        if (req->rq_replied &&
+        if (ptlrpc_client_replied(req) &&
              lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
                  DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
                            lustre_msg_get_status(req->rq_repmsg),
@@ -2015,10 +2194,6 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
                  lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
          }
  
-        spin_lock(&imp->imp_lock);
-        imp->imp_last_replay_transno = req->rq_transno;
-        spin_unlock(&imp->imp_lock);
-
          /* continue with recovery */
          rc = ptlrpc_import_recovery_state_machine(imp);
   out:
@@ -2041,14 +2216,15 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
          /* Not handling automatic bulk replay yet (or ever?) */
          LASSERT(req->rq_bulk == NULL);
  
-        LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
-        aa = (struct ptlrpc_replay_async_args *)&req->rq_async_args;
+        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+        aa = ptlrpc_req_async_args(req);
          memset(aa, 0, sizeof *aa);
  
          /* Prepare request to be resent with ptlrpcd */
          aa->praa_old_state = req->rq_send_state;
          req->rq_send_state = LUSTRE_IMP_REPLAY;
          req->rq_phase = RQ_PHASE_NEW;
+        req->rq_next_phase = RQ_PHASE_UNDEFINED;
          if (req->rq_repmsg)
                  aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
          req->rq_status = 0;
@@ -2090,7 +2266,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
                  if (req->rq_import_generation < imp->imp_generation) {
                          req->rq_err = 1;
                          req->rq_status = -EINTR;
-                        ptlrpc_wake_client_req(req);
+                        ptlrpc_client_wake_req(req);
                  }
                  spin_unlock (&req->rq_lock);
          }
@@ -2105,7 +2281,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
                  if (req->rq_import_generation < imp->imp_generation) {
                          req->rq_err = 1;
                          req->rq_status = -EINTR;
-                        ptlrpc_wake_client_req(req);
+                        ptlrpc_client_wake_req(req);
                  }
                  spin_unlock (&req->rq_lock);
          }
@@ -2120,8 +2296,60 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
          EXIT;
  }
  
-static __u64 ptlrpc_last_xid = 0;
-spinlock_t ptlrpc_last_xid_lock;
+void ptlrpc_abort_set(struct ptlrpc_request_set *set)
+{
+        struct list_head *tmp, *pos;
+
+        LASSERT(set != NULL);
+
+        list_for_each_safe(pos, tmp, &set->set_requests) {
+                struct ptlrpc_request *req =
+                        list_entry(pos, struct ptlrpc_request, rq_set_chain);
+
+                spin_lock(&req->rq_lock);
+                if (req->rq_phase != RQ_PHASE_RPC) {
+                        spin_unlock(&req->rq_lock);
+                        continue;
+                }
+
+                req->rq_err = 1;
+                req->rq_status = -EINTR;
+                ptlrpc_client_wake_req(req);
+                spin_unlock(&req->rq_lock);
+        }
+}
+
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/* Initialize the XID for the node.  This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing.  It does not need to be sequential.  Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would cause old to be delivered into the wrong buffer) we initialize
+ * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+        time_t now = cfs_time_current_sec();
+
+        spin_lock_init(&ptlrpc_last_xid_lock);
+        if (now < YEAR_2004) {
+                ll_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+                ptlrpc_last_xid >>= 2;
+                ptlrpc_last_xid |= (1ULL << 61);
+        } else {
+                ptlrpc_last_xid = (now << 20);
+        }
+}
  
  __u64 ptlrpc_next_xid(void)
  {
@@ -2134,11 +2362,15 @@ __u64 ptlrpc_next_xid(void)
  
  __u64 ptlrpc_sample_next_xid(void)
  {
-        __u64 tmp;
-        spin_lock(&ptlrpc_last_xid_lock);
-        tmp = ptlrpc_last_xid + 1;
-        spin_unlock(&ptlrpc_last_xid_lock);
-        return tmp;
+        if (sizeof(long) < 8) {
+                /* need to avoid possible word tearing on 32-bit systems */
+                __u64 tmp;
+                spin_lock(&ptlrpc_last_xid_lock);
+                tmp = ptlrpc_last_xid + 1;
+                spin_unlock(&ptlrpc_last_xid_lock);
+                return tmp;
+        }
+        /* No need to lock, since returned value is racy anyways */
+        return ptlrpc_last_xid + 1;
  }
  EXPORT_SYMBOL(ptlrpc_sample_next_xid);
-
diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c

index cd09f2b..fb86c6c 100644 (file)
--- a/lustre/ptlrpc/connection.c
+++ b/lustre/ptlrpc/connection.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -35,211 +46,192 @@
  #include "ptlrpc_internal.h"
  #include <class_hash.h>
  
-static spinlock_t conn_lock;
-static struct list_head conn_list;
-static struct list_head conn_unused_list;
-static struct lustre_class_hash_body *conn_hash_body;
-static struct lustre_class_hash_body *conn_unused_hash_body;
+static lustre_hash_t *conn_hash = NULL;
+static lustre_hash_ops_t conn_hash_ops;
  
-extern struct lustre_hash_operations conn_hash_operations;
-
-void ptlrpc_dump_connections(void)
+struct ptlrpc_connection *
+ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
+                      struct obd_uuid *uuid)
  {
-        struct list_head *tmp;
-        struct ptlrpc_connection *c;
+        struct ptlrpc_connection *conn, *conn2;
          ENTRY;
  
-        list_for_each(tmp, &conn_list) {
-                c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                CERROR("Connection %p/%s has refcount %d (nid=%s->%s)\n",
-                       c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount),
-                       libcfs_nid2str(c->c_self), 
-                       libcfs_nid2str(c->c_peer.nid));
+        conn = lustre_hash_lookup(conn_hash, &peer);
+        if (conn)
+                GOTO(out, conn);
+
+        OBD_ALLOC_PTR(conn);
+        if (!conn)
+                RETURN(NULL);
+
+        conn->c_peer = peer;
+        conn->c_self = self;
+        INIT_HLIST_NODE(&conn->c_hash);
+        atomic_set(&conn->c_refcount, 1);
+        if (uuid)
+                obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
+
+        /* 
+         * Add the newly created conn to the hash, on key collision we
+         * lost a racing addition and must destroy our newly allocated
+         * connection.  The object which exists in the has will be
+         * returned and may be compared against out object. 
+         */
+        conn2 = lustre_hash_findadd_unique(conn_hash, &peer, &conn->c_hash);
+        if (conn != conn2) {
+                OBD_FREE_PTR(conn);
+                conn = conn2;
          }
          EXIT;
+out:
+        CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+               conn, atomic_read(&conn->c_refcount), 
+               libcfs_nid2str(conn->c_peer.nid));
+        return conn;
  }
-
-struct ptlrpc_connection*
-ptlrpc_lookup_conn_locked (lnet_process_id_t peer)
+  
+int ptlrpc_connection_put(struct ptlrpc_connection *conn)
  {
-        struct ptlrpc_connection *c;
-
-        c = lustre_hash_get_object_by_key(conn_hash_body, &peer);
-        if (c != NULL)
-                return c;
+        int rc = 0;
+        ENTRY;
+  
+        if (!conn)
+                RETURN(rc);
+  
+        LASSERT(!hlist_unhashed(&conn->c_hash));
+  
+        /*
+         * We do not remove connection from hashtable and 
+         * do not free it even if last caller released ref,
+         * as we want to have it cached for the case it is
+         * needed again.
+         *
+         * Deallocating it and later creating new connection
+         * again would be wastful. This way we also avoid
+         * expensive locking to protect things from get/put 
+         * race when found cached connection is freed by 
+         * ptlrpc_connection_put().
+         *
+         * It will be freed later in module unload time,
+         * when ptlrpc_connection_fini()->lh_exit->conn_exit()
+         * path is called.
+         */
+        if (atomic_dec_return(&conn->c_refcount) == 1)
+                rc = 1;
  
-        c = lustre_hash_get_object_by_key(conn_unused_hash_body, &peer);
-        if (c != NULL)
-                return c;
+        CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n",
+               conn, atomic_read(&conn->c_refcount),
+               libcfs_nid2str(conn->c_peer.nid));
  
-        return NULL;
+        RETURN(rc);
  }
-
-
-struct ptlrpc_connection *ptlrpc_get_connection(lnet_process_id_t peer,
-                                                lnet_nid_t self, struct obd_uuid *uuid)
+  
+struct ptlrpc_connection *
+ptlrpc_connection_addref(struct ptlrpc_connection *conn)
  {
-        struct ptlrpc_connection *c;
-        struct ptlrpc_connection *c2;
-        int rc = 0;
          ENTRY;
  
-        CDEBUG(D_INFO, "self %s peer %s\n", 
-               libcfs_nid2str(self), libcfs_id2str(peer));
-
-        spin_lock(&conn_lock);
-
-        c = ptlrpc_lookup_conn_locked(peer);
-        
-        spin_unlock(&conn_lock);
-
-        if (c != NULL)
-                RETURN (c);
-        
-        OBD_ALLOC(c, sizeof(*c));
-        if (c == NULL)
-                RETURN (NULL);
-
-        atomic_set(&c->c_refcount, 1);
-        c->c_peer = peer;
-        c->c_self = self;
-       INIT_HLIST_NODE(&c->c_hash);
-       CFS_INIT_LIST_HEAD(&c->c_link);
-        if (uuid != NULL)
-                obd_str2uuid(&c->c_remote_uuid, uuid->uuid);
-
-        spin_lock(&conn_lock);
-
-        c2 = ptlrpc_lookup_conn_locked(peer);
-        if (c2 == NULL) {
-                list_add(&c->c_link, &conn_list);
-                rc = lustre_hash_additem_unique(conn_hash_body, &peer, 
-                                                &c->c_hash);
-                if (rc != 0) {
-                        list_del(&c->c_link);
-                        CERROR("Cannot add connection to conn_hash_body\n");
-                        goto out_conn;
-                }
-        }
-
-out_conn:
+        atomic_inc(&conn->c_refcount);
+        CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+               conn, atomic_read(&conn->c_refcount),
+               libcfs_nid2str(conn->c_peer.nid));
  
-        spin_unlock(&conn_lock);
-
-        if (c2 == NULL && rc == 0)
-                RETURN (c);
-
-        if (c != NULL) 
-                OBD_FREE(c, sizeof(*c));
-        RETURN (c2);
+        RETURN(conn);
  }
-
-int ptlrpc_put_connection(struct ptlrpc_connection *c)
+  
+int ptlrpc_connection_init(void)
  {
-        int rc = 0;
-        lnet_process_id_t peer;
          ENTRY;
  
-        if (c == NULL) {
-                CERROR("NULL connection\n");
-                RETURN(0);
-        }
-
-        peer = c->c_peer;
-
-        CDEBUG (D_INFO, "connection=%p refcount %d to %s\n",
-                c, atomic_read(&c->c_refcount) - 1, 
-                libcfs_nid2str(c->c_peer.nid));
-
-        spin_lock(&conn_lock);
-        LASSERT(!hlist_unhashed(&c->c_hash));
-        spin_unlock(&conn_lock);
-
-        if (atomic_dec_return(&c->c_refcount) == 1) {
-
-                spin_lock(&conn_lock);
-
-                lustre_hash_delitem(conn_hash_body, &peer, &c->c_hash);
-                list_del(&c->c_link);
+        conn_hash = lustre_hash_init("CONN_HASH", 5, 15,
+                                     &conn_hash_ops, LH_REHASH);
+        if (!conn_hash)
+                RETURN(-ENOMEM);
+  
+        RETURN(0);
+}
+  
+void ptlrpc_connection_fini(void) {
+        ENTRY;
+        lustre_hash_exit(conn_hash);
+        EXIT;
+}
  
-                list_add(&c->c_link, &conn_unused_list);
-                rc = lustre_hash_additem_unique(conn_unused_hash_body, &peer, 
-                                                &c->c_hash);
-                if (rc != 0) {
-                        spin_unlock(&conn_lock);
-                        CERROR("Cannot hash connection to conn_hash_body\n");
-                        GOTO(ret, rc);
-                }
+/*
+ * Hash operations for net_peer<->connection
+ */
+static unsigned
+conn_hashfn(lustre_hash_t *lh,  void *key, unsigned mask)
+{
+        return lh_djb2_hash(key, sizeof(lnet_process_id_t), mask);
+}
  
-                spin_unlock(&conn_lock);
-                rc = 1;
- 
-        } 
+static int
+conn_compare(void *key, struct hlist_node *hnode)
+{
+        struct ptlrpc_connection *conn;
+        lnet_process_id_t *conn_key;
  
-        if (atomic_read(&c->c_refcount) < 0)
-                CERROR("connection %p refcount %d!\n",
-                       c, atomic_read(&c->c_refcount));
-ret :
+        LASSERT(key != NULL);
+        conn_key = (lnet_process_id_t*)key;
+        conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
  
-        RETURN(rc);
+        return conn_key->nid == conn->c_peer.nid &&
+               conn_key->pid == conn->c_peer.pid;
  }
  
-struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *c)
+static void *
+conn_key(struct hlist_node *hnode)
  {
-        ENTRY;
-        atomic_inc(&c->c_refcount);
-        CDEBUG (D_INFO, "connection=%p refcount %d to %s\n",
-                c, atomic_read(&c->c_refcount),
-                libcfs_nid2str(c->c_peer.nid));
-        RETURN(c);
+        struct ptlrpc_connection *conn;
+        conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+        return &conn->c_peer;
  }
  
-int ptlrpc_init_connection(void)
+static void *
+conn_get(struct hlist_node *hnode)
  {
-        int rc = 0;
-        CFS_INIT_LIST_HEAD(&conn_list);
-        rc = lustre_hash_init(&conn_hash_body, "CONN_HASH", 
-                              128, &conn_hash_operations);
-        if (rc)
-                GOTO(ret, rc);
-
-        CFS_INIT_LIST_HEAD(&conn_unused_list);
-        rc = lustre_hash_init(&conn_unused_hash_body, "CONN_UNUSED_HASH", 
-                              128, &conn_hash_operations);
-        if (rc)
-                GOTO(ret, rc);
-
-        spin_lock_init(&conn_lock);
-ret:
-        if (rc) {
-                lustre_hash_exit(&conn_hash_body);
-                lustre_hash_exit(&conn_unused_hash_body);
-        }
-        RETURN(rc);
+        struct ptlrpc_connection *conn;
+
+        conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+        atomic_inc(&conn->c_refcount);
+
+        return conn;
  }
  
-void ptlrpc_cleanup_connection(void)
+static void *
+conn_put(struct hlist_node *hnode)
  {
-        struct list_head *tmp, *pos;
-        struct ptlrpc_connection *c;
+        struct ptlrpc_connection *conn;
  
-        spin_lock(&conn_lock);
+        conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+        atomic_dec(&conn->c_refcount);
  
-        lustre_hash_exit(&conn_unused_hash_body);
-        list_for_each_safe(tmp, pos, &conn_unused_list) {
-                c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                list_del(&c->c_link);
-                OBD_FREE(c, sizeof(*c));
-        }
+        return conn;
+}
  
-        lustre_hash_exit(&conn_hash_body);
-        list_for_each_safe(tmp, pos, &conn_list) {
-                c = list_entry(tmp, struct ptlrpc_connection, c_link);
-                CERROR("Connection %p/%s has refcount %d (nid=%s)\n",
-                       c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount),
-                       libcfs_nid2str(c->c_peer.nid));
-                list_del(&c->c_link);
-                OBD_FREE(c, sizeof(*c));
-        }
-        spin_unlock(&conn_lock);
+static void
+conn_exit(struct hlist_node *hnode)
+{
+        struct ptlrpc_connection *conn;
+
+        conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+        /* 
+         * Nothing should be left. Connection user put it and
+         * connection also was deleted from table by this time
+         * so we should have 0 refs.
+         */
+        LASSERTF(atomic_read(&conn->c_refcount) == 0, 
+                 "Busy connection with %d refs\n", 
+                 atomic_read(&conn->c_refcount));
+        OBD_FREE_PTR(conn);
  }
+
+static lustre_hash_ops_t conn_hash_ops = {
+        .lh_hash    = conn_hashfn,
+        .lh_compare = conn_compare,
+        .lh_key     = conn_key,
+        .lh_get     = conn_get,
+        .lh_put     = conn_put,
+        .lh_exit    = conn_exit,
+};
diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c

index 59878f0..18d5eff 100644 (file)
--- a/lustre/ptlrpc/events.c
+++ b/lustre/ptlrpc/events.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -63,7 +74,7 @@ void request_out_callback(lnet_event_t *ev)
                  req->rq_net_err = 1;
                  spin_unlock(&req->rq_lock);
  
-                ptlrpc_wake_client_req(req);
+                ptlrpc_client_wake_req(req);
          }
  
          ptlrpc_req_finished(req);
@@ -82,7 +93,7 @@ void reply_in_callback(lnet_event_t *ev)
  
          DEBUG_REQ((ev->status == 0) ? D_NET : D_ERROR, req,
                    "type %d, status %d", ev->type, ev->status);
-        
+
          LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
          LASSERT(ev->md.start == req->rq_repbuf);
          LASSERT(ev->mlength <= req->rq_replen);
@@ -94,16 +105,18 @@ void reply_in_callback(lnet_event_t *ev)
  
          req->rq_receiving_reply = 0;
          req->rq_early = 0;
-        
+        if (ev->unlinked)
+                req->rq_must_unlink = 0;
+
          if (ev->status)
                  goto out_wake;
          if (ev->type == LNET_EVENT_UNLINK) {
-                req->rq_must_unlink = 0;
+                LASSERT(ev->unlinked);
                  DEBUG_REQ(D_RPCTRACE, req, "unlink");
                  goto out_wake;
          }
  
-        if ((ev->offset == 0) && 
+        if ((ev->offset == 0) &&
              (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
                  /* Early reply */
                  DEBUG_REQ(D_ADAPTTO, req,
@@ -111,23 +124,15 @@ void reply_in_callback(lnet_event_t *ev)
                            "replied=%d unlinked=%d", ev->mlength, ev->offset,
                            req->rq_replen, req->rq_replied, ev->unlinked);
  
-                if (unlikely(ev->mlength != lustre_msg_early_size()))
+                if (unlikely(ev->mlength != lustre_msg_early_size(req)))
                          CERROR("early reply sized %u, expect %u\n",
-                               ev->mlength, lustre_msg_early_size());
+                               ev->mlength, lustre_msg_early_size(req));
  
                  req->rq_early_count++; /* number received, client side */
-                if (req->rq_replied) {
-                        /* If we already got the real reply, then we need to
-                         * check if lnet_finalize() unlinked the md.  In that
-                         * case, there will be no further callback of type
-                         * LNET_EVENT_UNLINK.
-                         */
-                        if (ev->unlinked)
-                                req->rq_must_unlink = 0;
-                        else
-                                DEBUG_REQ(D_RPCTRACE, req, "unlinked in reply");
+
+                if (req->rq_replied)   /* already got the real reply */
                          goto out_wake;
-                }
+
                  req->rq_early = 1;
                  req->rq_nob_received = ev->mlength;
                  /* repmsg points to early reply */
@@ -136,6 +141,7 @@ void reply_in_callback(lnet_event_t *ev)
                  req->rq_receiving_reply = 1;
          } else {
                  /* Real reply */
+                req->rq_rep_swab_mask = 0;
                  req->rq_replied = 1;
                  req->rq_nob_received = ev->mlength;
                  /* repmsg points to real reply */
@@ -154,7 +160,7 @@ void reply_in_callback(lnet_event_t *ev)
  out_wake:
          /* NB don't unlock till after wakeup; req can disappear under us
           * since we don't have our own ref */
-        ptlrpc_wake_client_req(req);
+        ptlrpc_client_wake_req(req);
          spin_unlock(&req->rq_lock);
          EXIT;
  }
@@ -195,7 +201,7 @@ void client_bulk_callback (lnet_event_t *ev)
  
          /* NB don't unlock till after wakeup; desc can disappear under us
           * otherwise */
-        ptlrpc_wake_client_req(desc->bd_req);
+        ptlrpc_client_wake_req(desc->bd_req);
  
          spin_unlock(&desc->bd_lock);
          EXIT;
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c

index 3125de8..d66a2a2 100644 (file)
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -1,26 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Mike Shaver <shaver@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/import.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -135,12 +150,12 @@ int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
                          LCONSOLE_ERROR_MSG(0x166, "%s: Connection to service "
                                 "%.*s via nid %s was lost; in progress "
                                 "operations using this service will fail.\n",
-                               imp->imp_obd->obd_name, target_len, target_start, 
+                               imp->imp_obd->obd_name, target_len, target_start,
                                 libcfs_nid2str(imp->imp_connection->c_peer.nid));
                  }
                  IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
                  spin_unlock(&imp->imp_lock);
-    
+
                  if (obd_dump_on_timeout)
                          libcfs_debug_dumplog();
  
@@ -184,6 +199,46 @@ void ptlrpc_deactivate_import(struct obd_import *imp)
          ptlrpc_deactivate_and_unlock_import(imp);
  }
  
+static unsigned int 
+ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now)
+{
+        long dl;
+
+        if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+              (req->rq_phase == RQ_PHASE_BULK) || 
+              (req->rq_phase == RQ_PHASE_NEW)))
+                return 0;
+
+        if (req->rq_timedout)
+                return 0;
+
+        if (req->rq_phase == RQ_PHASE_NEW)
+                dl = req->rq_sent;
+        else
+                dl = req->rq_deadline;
+
+        if (dl <= now)
+                return 0;
+
+        return dl - now;
+}
+
+static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
+{
+        time_t now = cfs_time_current_sec();
+        struct list_head *tmp, *n;
+        struct ptlrpc_request *req;
+        unsigned int timeout = 0;
+
+        spin_lock(&imp->imp_lock);
+        list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
+        }
+        spin_unlock(&imp->imp_lock);
+        return timeout;
+}
+
  /*
   * This function will invalidate the import, if necessary, then block
   * for all the RPC completions, and finally notify the obd to
@@ -195,11 +250,12 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
          struct list_head *tmp, *n;
          struct ptlrpc_request *req;
          struct l_wait_info lwi;
+        unsigned int timeout;
          int rc;
  
          atomic_inc(&imp->imp_inval_count);
  
-        /* 
+        /*
           * If this is an invalid MGC connection, then don't bother
           * waiting for imp_inflight to drop to 0.
           */
@@ -211,35 +267,80 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
  
          LASSERT(imp->imp_invalid);
  
-        /* wait for all requests to error out and call completion callbacks.
-           Cap it at obd_timeout -- these should all have been locally
-           cancelled by ptlrpc_abort_inflight. */
-        lwi = LWI_TIMEOUT_INTERVAL(
-                cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
-                cfs_time_seconds(1), NULL, NULL);
-        rc = l_wait_event(imp->imp_recovery_waitq,
-                          (atomic_read(&imp->imp_inflight) == 0), &lwi);
-
-        if (rc) {
-                CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
-                       obd2cli_tgt(imp->imp_obd), rc,
-                       atomic_read(&imp->imp_inflight));
-                spin_lock(&imp->imp_lock);
-                list_for_each_safe(tmp, n, &imp->imp_sending_list) {
-                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                        DEBUG_REQ(D_ERROR, req, "still on sending list");
-                }
-                list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
-                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                        DEBUG_REQ(D_ERROR, req, "still on delayed list");
+        /* Wait forever until inflight == 0. We really can't do it another
+         * way because in some cases we need to wait for very long reply 
+         * unlink. We can't do anything before that because there is really
+         * no guarantee that some rdma transfer is not in progress right now. */
+        do {
+                /* Calculate max timeout for waiting on rpcs to error 
+                 * out. Use obd_timeout if calculated value is smaller
+                 * than it. */
+                timeout = ptlrpc_inflight_timeout(imp);
+                timeout += timeout / 3;
+                
+                if (timeout == 0)
+                        timeout = obd_timeout;
+                
+                CDEBUG(D_RPCTRACE, "Sleeping %d sec for inflight to error out\n",
+                       timeout);
+
+                /* Wait for all requests to error out and call completion
+                 * callbacks. Cap it at obd_timeout -- these should all
+                 * have been locally cancelled by ptlrpc_abort_inflight. */
+                lwi = LWI_TIMEOUT_INTERVAL(
+                        cfs_timeout_cap(cfs_time_seconds(timeout)),
+                        cfs_time_seconds(1), NULL, NULL);
+                rc = l_wait_event(imp->imp_recovery_waitq,
+                                (atomic_read(&imp->imp_inflight) == 0), &lwi);
+                if (rc) {
+                        const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
+
+                        CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
+                               cli_tgt, rc, atomic_read(&imp->imp_inflight));
+
+                        spin_lock(&imp->imp_lock);
+                        list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+                                req = list_entry(tmp, struct ptlrpc_request, 
+                                        rq_list);
+                                DEBUG_REQ(D_ERROR, req, "still on sending list");
+                        }
+                        list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+                                req = list_entry(tmp, struct ptlrpc_request, 
+                                        rq_list);
+                                DEBUG_REQ(D_ERROR, req, "still on delayed list");
+                        }
+                        
+                        if (atomic_read(&imp->imp_unregistering) == 0) {
+                                /* We know that only "unregistering" rpcs may
+                                 * still survive in sending or delaying lists
+                                 * (They are waiting for long reply unlink in
+                                 * sluggish nets). Let's check this. If there
+                                 * is no unregistering and inflight != 0 this
+                                 * is bug. */
+                                LASSERT(atomic_read(&imp->imp_inflight) == 0);
+                                
+                                /* Let's save one loop as soon as inflight have
+                                 * dropped to zero. No new inflights possible at
+                                 * this point. */
+                                rc = 0;
+                        } else {
+                                CERROR("%s: RPCs in \"%s\" phase found (%d). "
+                                       "Network is sluggish? Waiting them "
+                                       "to error out.\n", cli_tgt,
+                                       ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
+                                       atomic_read(&imp->imp_unregistering));
+                        }
+                        spin_unlock(&imp->imp_lock);
                  }
-                spin_unlock(&imp->imp_lock);
-                LASSERT(atomic_read(&imp->imp_inflight) == 0);
-        }
+        } while (rc != 0);
  
-  out:
+        /* Let's additionally check that no new rpcs added to import in
+         * "invalidate" state. */
+        LASSERT(atomic_read(&imp->imp_inflight) == 0);
+
+out:
          obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
-        
+
          atomic_dec(&imp->imp_inval_count);
          cfs_waitq_signal(&imp->imp_recovery_waitq);
  }
@@ -286,8 +387,8 @@ void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
  
  int ptlrpc_reconnect_import(struct obd_import *imp)
  {
-        
-        ptlrpc_set_import_discon(imp, 0); 
+
+        ptlrpc_set_import_discon(imp, 0);
          /* Force a new connect attempt */
          ptlrpc_invalidate_import(imp);
          /* Do a fresh connect next time by zeroing the handle */
@@ -300,16 +401,16 @@ int ptlrpc_reconnect_import(struct obd_import *imp)
                                    (atomic_read(&imp->imp_inval_count) == 0),
                                    &lwi);
                  if (rc)
-                        CERROR("Interrupted, inval=%d\n", 
+                        CERROR("Interrupted, inval=%d\n",
                                 atomic_read(&imp->imp_inval_count));
          }
  
-        /* 
+        /*
           * Allow reconnect attempts. Note: Currently, the function is
           * only called by MGC. So assume this is a recoverable import,
-         * and force import to be recoverable. fix this if you need to 
+         * and force import to be recoverable. fix this if you need to
           */
-        
+
          imp->imp_obd->obd_no_recov = 0;
          /* Remove 'invalid' flag */
          ptlrpc_activate_import(imp);
@@ -341,10 +442,10 @@ static int import_select_connection(struct obd_import *imp)
                         imp->imp_obd->obd_name,
                         libcfs_nid2str(conn->oic_conn->c_peer.nid),
                         conn->oic_last_attempt);
-                
+
                  /* Don't thrash connections */
                  if (cfs_time_before_64(cfs_time_current_64(),
-                                     conn->oic_last_attempt + 
+                                     conn->oic_last_attempt +
                                       cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
                          continue;
                  }
@@ -400,13 +501,13 @@ static int import_select_connection(struct obd_import *imp)
  
          /* switch connection, don't mind if it's same as the current one */
          if (imp->imp_connection)
-                ptlrpc_put_connection(imp->imp_connection);
+                ptlrpc_connection_put(imp->imp_connection);
          imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
  
          dlmexp =  class_conn2export(&imp->imp_dlm_handle);
          LASSERT(dlmexp != NULL);
          if (dlmexp->exp_connection)
-                ptlrpc_put_connection(dlmexp->exp_connection);
+                ptlrpc_connection_put(dlmexp->exp_connection);
          dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
          class_export_put(dlmexp);
  
@@ -435,7 +536,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
          int rc;
          __u64 committed_before_reconnect = 0;
          struct ptlrpc_request *request;
-        int size[] = { sizeof(struct ptlrpc_body),
+        __u32 size[] = { sizeof(struct ptlrpc_body),
                         sizeof(imp->imp_obd->u.cli.cl_target_uuid),
                         sizeof(obd->obd_uuid),
                         sizeof(imp->imp_dlm_handle),
@@ -515,7 +616,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
          imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
  
          rc = obd_reconnect(imp->imp_obd->obd_self_export, obd,
-                           &obd->obd_uuid, &imp->imp_connect_data);
+                           &obd->obd_uuid, &imp->imp_connect_data, NULL);
          if (rc)
                  GOTO(out, rc);
  
@@ -524,6 +625,19 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
          if (!request)
                  GOTO(out, rc = -ENOMEM);
  
+        /* Report the rpc service time to the server so that it knows how long
+         * to wait for clients to join recovery */
+        lustre_msg_set_service_time(request->rq_reqmsg,
+                                    at_timeout2est(request->rq_timeout));
+
+        /* The amount of time we give the server to process the connect req.
+         * import_select_connection will increase the net latency on
+         * repeated reconnect attempts to cover slow networks.
+         * We override/ignore the server rpc completion estimate here,
+         * which may be large if this is a reconnect attempt */
+        request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+        lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
  #ifndef __KERNEL__
          lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
  #endif
@@ -531,6 +645,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
                  lustre_msg_add_op_flags(request->rq_reqmsg,
                                          MSG_CONNECT_NEXT_VER);
  
+        request->rq_no_resend = request->rq_no_delay = 1;
          request->rq_send_state = LUSTRE_IMP_CONNECTING;
          /* Allow a slightly larger reply for future growth compatibility */
          size[REPLY_REC_OFF] = sizeof(struct obd_connect_data) +
@@ -538,8 +653,8 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
          ptlrpc_req_set_repsize(request, 2, size);
          request->rq_interpret_reply = ptlrpc_connect_interpret;
  
-        CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args));
-        aa = (struct ptlrpc_connect_async_args *)&request->rq_async_args;
+        CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
+        aa = ptlrpc_req_async_args(request);
          memset(aa, 0, sizeof *aa);
  
          aa->pcaa_peer_committed = committed_before_reconnect;
@@ -548,14 +663,10 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
                  spin_lock(&imp->imp_lock);
                  imp->imp_replayable = 1;
                  spin_unlock(&imp->imp_lock);
-                if (AT_OFF)
-                        /* AT will use INITIAL_CONNECT_TIMEOUT the first
-                           time, adaptive after that. */
-                        request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
          }
  
          DEBUG_REQ(D_RPCTRACE, request, "%sconnect request %d",
-                  aa->pcaa_initial_connect ? "initial " : "re", 
+                  aa->pcaa_initial_connect ? "initial " : "re",
                    imp->imp_conn_cnt);
          ptlrpcd_add_req(request);
          rc = 0;
@@ -587,11 +698,11 @@ static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
                                oic_item);
  
          /* XXX: When the failover node is the primary node, it is possible
-         * to have two identical connections in imp_conn_list. We must 
+         * to have two identical connections in imp_conn_list. We must
           * compare not conn's pointers but NIDs, otherwise we can defeat
           * connection throttling. (See bug 14774.) */
-        if (imp->imp_conn_current->oic_conn->c_self != 
-                                imp_conn->oic_conn->c_self) {
+        if (imp->imp_conn_current->oic_conn->c_peer.nid !=
+                                imp_conn->oic_conn->c_peer.nid) {
                  ptlrpc_ping_import_soon(imp);
                  wake_pinger = 1;
          }
@@ -599,7 +710,7 @@ static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
  #else
          /* liblustre has no pinger thead, so we wakup pinger anyway */
          wake_pinger = 1;
-#endif 
+#endif
   unlock:
          spin_unlock(&imp->imp_lock);
  
@@ -616,6 +727,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
          struct obd_import *imp = request->rq_import;
          struct client_obd *cli = &imp->imp_obd->u.cli;
          struct lustre_handle old_hdl;
+        __u64 old_connect_flags;
          int msg_flags;
          ENTRY;
  
@@ -662,7 +774,17 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                  imp->imp_remote_handle =
                                  *lustre_msg_get_handle(request->rq_repmsg);
  
-                IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+                /* Initial connects are allowed for clients with non-random
+                 * uuids when servers are in recovery.  Simply signal the
+                 * servers replay is complete and wait in REPLAY_WAIT. */
+                if (msg_flags & MSG_CONNECT_RECOVERING) {
+                        CDEBUG(D_HA, "connect to %s during recovery\n",
+                               obd2cli_tgt(imp->imp_obd));
+                        IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+                } else {
+                        IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+                }
+
                  spin_lock(&imp->imp_lock);
                  if (imp->imp_invalid) {
                          spin_unlock(&imp->imp_lock);
@@ -670,7 +792,6 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                  } else {
                          spin_unlock(&imp->imp_lock);
                  }
-
                  GOTO(finish, rc = 0);
          } else {
                  spin_unlock(&imp->imp_lock);
@@ -691,19 +812,34 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                  if (memcmp(&imp->imp_remote_handle,
                             lustre_msg_get_handle(request->rq_repmsg),
                             sizeof(imp->imp_remote_handle))) {
+                        int level = msg_flags & MSG_CONNECT_RECOVERING ? D_HA :
+                                                                         D_WARNING;
+
+                        /* Bug 16611/14775: if server handle have changed,
+                         * that means some sort of disconnection happened.
+                         * If the server is not in recovery, that also means it
+                         * already erased all of our state because of previous
+                         * eviction. If it is in recovery - we are safe to
+                         * participate since we can reestablish all of our state
+                         * with server again */
+                        CDEBUG(level,"%s@%s changed server handle from "
+                                     LPX64" to "LPX64"%s \n" "but is still in recovery \n",
+                                     obd2cli_tgt(imp->imp_obd),
+                                     imp->imp_connection->c_remote_uuid.uuid,
+                                     imp->imp_remote_handle.cookie,
+                                     lustre_msg_get_handle(request->rq_repmsg)->
+                                                                        cookie,
+                                     (MSG_CONNECT_RECOVERING & msg_flags) ?
+                                         "but is still in recovery" : "");
  
-                        CWARN("%s@%s changed server handle from "
-                               LPX64" to "LPX64" - evicting.\n",
-                               obd2cli_tgt(imp->imp_obd),
-                               imp->imp_connection->c_remote_uuid.uuid,
-                               imp->imp_remote_handle.cookie,
-                               lustre_msg_get_handle(request->rq_repmsg)->
-                                         cookie);
                          imp->imp_remote_handle =
                                       *lustre_msg_get_handle(request->rq_repmsg);
  
-                        IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
-                        GOTO(finish, rc = 0);
+                        if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+                                IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+                                GOTO(finish, rc = 0);
+                        }
+
                  } else {
                          CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
                                 obd2cli_tgt(imp->imp_obd),
@@ -721,6 +857,11 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
  
                          spin_lock(&imp->imp_lock);
                          imp->imp_resend_replay = 1;
+                        /* VBR: delayed connection */
+                        if (MSG_CONNECT_DELAYED & msg_flags) {
+                                imp->imp_delayed_recovery = 1;
+                                imp->imp_no_lock_replay = 1;
+                        }
                          spin_unlock(&imp->imp_lock);
  
                          IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
@@ -732,6 +873,13 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                  imp->imp_remote_handle =
                                  *lustre_msg_get_handle(request->rq_repmsg);
                  imp->imp_last_replay_transno = 0;
+                /* VBR: delayed connection */
+                if (MSG_CONNECT_DELAYED & msg_flags) {
+                        spin_lock(&imp->imp_lock);
+                        imp->imp_delayed_recovery = 1;
+                        imp->imp_no_lock_replay = 1;
+                        spin_unlock(&imp->imp_lock);
+                }
                  IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
          } else {
                  DEBUG_REQ(D_HA, request, "evicting (not initial connect and "
@@ -751,8 +899,8 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
              aa->pcaa_peer_committed) {
                  CERROR("%s went back in time (transno "LPD64
                         " was previously committed, server now claims "LPD64
-                       ")!  See https://bugzilla.clusterfs.com/"
-                       "long_list.cgi?buglist=9646\n",
+                       ")!  See https://bugzilla.lustre.org/show_bug.cgi?"
+                       "id=9646\n",
                         obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
                         lustre_msg_get_last_committed(request->rq_repmsg));
          }
@@ -799,12 +947,13 @@ finish:
                           imp->imp_connect_flags_orig, ocd->ocd_connect_flags);
  
                  if (!exp) {
-                        /* This could happen if export is cleaned during the 
+                        /* This could happen if export is cleaned during the
                             connect attempt */
-                        CERROR("Missing export for %s\n", 
+                        CERROR("Missing export for %s\n",
                                 imp->imp_obd->obd_name);
                          GOTO(out, rc = -ENODEV);
                  }
+                old_connect_flags = exp->exp_connect_flags;
                  exp->exp_connect_flags = ocd->ocd_connect_flags;
                  imp->imp_obd->obd_self_export->exp_connect_flags = ocd->ocd_connect_flags;
                  class_export_put(exp);
@@ -876,24 +1025,33 @@ finish:
                  }
  
                  if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) {
-                        cli->cl_max_pages_per_rpc = 
+                        cli->cl_max_pages_per_rpc =
                                  ocd->ocd_brw_size >> CFS_PAGE_SHIFT;
                  }
  
-                imp->imp_obd->obd_namespace->ns_connect_flags = 
-                        ocd->ocd_connect_flags;
-                imp->imp_obd->obd_namespace->ns_orig_connect_flags = 
-                        ocd->ocd_connect_flags;
+                /* Reset ns_connect_flags only for initial connect. It might be
+                 * changed in while using FS and if we reset it in reconnect
+                 * this leads to lossing user settings done before such as
+                 * disable lru_resize, etc. */
+                if (old_connect_flags != exp->exp_connect_flags ||
+                    aa->pcaa_initial_connect) {
+                        CWARN("Reseting ns_connect_flags to server flags: "LPU64"\n", 
+                              ocd->ocd_connect_flags);
+                        imp->imp_obd->obd_namespace->ns_connect_flags =
+                                ocd->ocd_connect_flags;
+                        imp->imp_obd->obd_namespace->ns_orig_connect_flags =
+                                ocd->ocd_connect_flags;
+                }
  
                  if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
                      (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
-                        /* We need a per-message support flag, because 
+                        /* We need a per-message support flag, because
                             a. we don't know if the incoming connect reply
                                supports AT or not (in reply_in_callback)
                                until we unpack it.
                             b. failovered server means export and flags are gone
                                (in ptlrpc_send_reply).
-                           Can only be set when we know AT is supported at 
+                           Can only be set when we know AT is supported at
                             both ends */
                          imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
                  else
@@ -953,7 +1111,7 @@ finish:
                         obd2cli_tgt(imp->imp_obd),
                         (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
          }
-        
+
          spin_lock(&imp->imp_lock);
          imp->imp_last_recon = 0;
          spin_unlock(&imp->imp_lock);
@@ -963,19 +1121,29 @@ finish:
  }
  
  static int completed_replay_interpret(struct ptlrpc_request *req,
-                                    void * data, int rc)
+                                      void * data, int rc)
  {
          ENTRY;
          atomic_dec(&req->rq_import->imp_replay_inflight);
-        if (req->rq_status == 0) {
+        if (req->rq_status == 0 &&
+            !req->rq_import->imp_vbr_failed) {
                  ptlrpc_import_recovery_state_machine(req->rq_import);
          } else {
-                CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
-                       "reconnecting\n",
-                       req->rq_import->imp_obd->obd_name, req->rq_status);
+                if (req->rq_import->imp_vbr_failed) {
+                        CDEBUG(D_WARNING,
+                               "%s: version recovery fails, reconnecting\n",
+                               req->rq_import->imp_obd->obd_name);
+                        spin_lock(&req->rq_import->imp_lock);
+                        req->rq_import->imp_vbr_failed = 0;
+                        spin_unlock(&req->rq_import->imp_lock);
+                } else {
+                        CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+                                     "reconnecting\n",
+                               req->rq_import->imp_obd->obd_name,
+                               req->rq_status);
+                }
                  ptlrpc_connect_import(req->rq_import, NULL);
          }
-
          RETURN(0);
  }
  
@@ -996,6 +1164,8 @@ static int signal_completed_replay(struct obd_import *imp)
          ptlrpc_req_set_repsize(req, 1, NULL);
          req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
          lustre_msg_add_flags(req->rq_reqmsg, MSG_LAST_REPLAY);
+        if (imp->imp_delayed_recovery)
+                lustre_msg_add_flags(req->rq_reqmsg, MSG_DELAY_REPLAY);
          req->rq_timeout *= 3;
          req->rq_interpret_reply = completed_replay_interpret;
  
@@ -1007,17 +1177,25 @@ static int signal_completed_replay(struct obd_import *imp)
  static int ptlrpc_invalidate_import_thread(void *data)
  {
          struct obd_import *imp = data;
+        int disconnect;
  
          ENTRY;
  
          ptlrpc_daemonize("ll_imp_inval");
-        
+
          CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
                 imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
                 imp->imp_connection->c_remote_uuid.uuid);
  
          ptlrpc_invalidate_import(imp);
  
+        /* is client_disconnect_export in flight ? */
+        spin_lock(&imp->imp_lock);
+        disconnect = imp->imp_deactive;
+        spin_unlock(&imp->imp_lock);
+        if (disconnect)
+                GOTO(out, 0 );
+
          if (obd_dump_on_eviction) {
                  CERROR("dump the log upon eviction\n");
                  libcfs_debug_dumplog();
@@ -1026,6 +1204,8 @@ static int ptlrpc_invalidate_import_thread(void *data)
          IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
          ptlrpc_import_recovery_state_machine(imp);
  
+out:
+        class_import_put(imp);
          RETURN(0);
  }
  #endif
@@ -1054,12 +1234,19 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                         imp->imp_connection->c_remote_uuid.uuid);
  
  #ifdef __KERNEL__
+                /* bug 17802:  XXX client_disconnect_export vs connect request
+                 * race. if client will evicted at this time, we start invalidate
+                 * thread without referece to import and import can be freed
+                 * at same time. */
+                class_import_get(imp);
                  rc = cfs_kernel_thread(ptlrpc_invalidate_import_thread, imp,
                                     CLONE_VM | CLONE_FILES);
-                if (rc < 0)
+                if (rc < 0) {
+                        class_import_put(imp);
                          CERROR("error starting invalidate thread: %d\n", rc);
-                else
+                } else {
                          rc = 0;
+                }
                  RETURN(rc);
  #else
                  ptlrpc_invalidate_import(imp);
@@ -1158,12 +1345,12 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
                  if (AT_OFF) {
                          timeout = cfs_time_seconds(obd_timeout);
                  } else {
-                        int idx = import_at_get_index(imp, 
+                        int idx = import_at_get_index(imp,
                                  imp->imp_client->cli_request_portal);
                          timeout = cfs_time_seconds(
                                  at_get(&imp->imp_at.iat_service_estimate[idx]));
                  }
-                lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), 
+                lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
                                         back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
                  rc = l_wait_event(imp->imp_recovery_waitq,
                                    !ptlrpc_import_in_recovery(imp), &lwi);
@@ -1181,14 +1368,14 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
                   * it fails.  We can get through the above with a down server
                   * if the client doesn't know the server is gone yet. */
                  req->rq_no_resend = 1;
-                
+
  #ifndef CRAY_XT3
-                /* We want client umounts to happen quickly, no matter the 
+                /* We want client umounts to happen quickly, no matter the
                     server state... */
                  req->rq_timeout = min_t(int, req->rq_timeout,
                                          INITIAL_CONNECT_TIMEOUT);
  #else
-                /* ... but we always want liblustre clients to nicely 
+                /* ... but we always want liblustre clients to nicely
                     disconnect, so only use the adaptive value. */
                  if (AT_OFF)
                          req->rq_timeout = obd_timeout / 3;
@@ -1204,12 +1391,12 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
  set_state:
          spin_lock(&imp->imp_lock);
  out:
-        if (noclose) 
+        if (noclose)
                  IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
          else
                  IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
          memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
-        /* Try all connections in the future - bz 12758 */ 
+        /* Try all connections in the future - bz 12758 */
          imp->imp_last_recon = 0;
          spin_unlock(&imp->imp_lock);
  
@@ -1232,7 +1419,7 @@ extern unsigned int at_min, at_max, at_history;
     This gives us a max of the last binlimit*AT_BINS secs without the storage,
     but still smoothing out a return to normalcy from a slow response.
     (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
-int at_add(struct adaptive_timeout *at, unsigned int val) 
+int at_add(struct adaptive_timeout *at, unsigned int val)
  {
          unsigned int old = at->at_current;
          time_t now = cfs_time_current_sec();
@@ -1240,12 +1427,12 @@ int at_add(struct adaptive_timeout *at, unsigned int val)
  
          LASSERT(at);
  #if 0
-        CDEBUG(D_INFO, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", 
+        CDEBUG(D_INFO, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
                 val, at, now - at->at_binstart, at->at_current,
                 at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
  #endif
-        if (val == 0) 
-                /* 0's don't count, because we never want our timeout to 
+        if (val == 0)
+                /* 0's don't count, because we never want our timeout to
                     drop to 0, and because 0 could mean an error */
                  return 0;
  
@@ -1303,22 +1490,22 @@ int at_add(struct adaptive_timeout *at, unsigned int val)
                         at->at_hist[0], at->at_hist[1], at->at_hist[2],
                         at->at_hist[3]);
  #endif
-        
+
          /* if we changed, report the old value */
          old = (at->at_current != old) ? old : 0;
-        
+
          spin_unlock(&at->at_lock);
          return old;
  }
  
  /* Find the imp_at index for a given portal; assign if space available */
-int import_at_get_index(struct obd_import *imp, int portal) 
+int import_at_get_index(struct obd_import *imp, int portal)
  {
          struct imp_at *at = &imp->imp_at;
          int i;
  
          for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
-                if (at->iat_portal[i] == portal) 
+                if (at->iat_portal[i] == portal)
                          return i;
                  if (at->iat_portal[i] == 0)
                          /* unused */
@@ -1330,13 +1517,13 @@ int import_at_get_index(struct obd_import *imp, int portal)
  
          /* Check unused under lock */
          for (; i < IMP_AT_MAX_PORTALS; i++) {
-                if (at->iat_portal[i] == portal) 
+                if (at->iat_portal[i] == portal)
                          goto out;
                  if (at->iat_portal[i] == 0)
                          /* unused */
                          break;
          }
-        
+
          /* Not enough portals? */
          LASSERT(i < IMP_AT_MAX_PORTALS);
  
@@ -1345,4 +1532,3 @@ out:
          spin_unlock(&imp->imp_lock);
          return i;
  }
-
diff --git a/lustre/ptlrpc/llog_client.c b/lustre/ptlrpc/llog_client.c

index ff5fddd..d463183 100644 (file)
--- a/lustre/ptlrpc/llog_client.c
+++ b/lustre/ptlrpc/llog_client.c
@@ -1,29 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2004 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *  remote api for llog - client side
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_client.c
+ *
+ * remote api for llog - client side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -78,7 +92,7 @@ static int llog_client_create(struct llog_ctxt *ctxt, struct llog_handle **res,
          struct llogd_body *body;
          struct llog_handle *handle;
          struct ptlrpc_request *req = NULL;
-        int size[3] = { sizeof(struct ptlrpc_body), sizeof(req_body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body), sizeof(req_body) };
          char *bufs[3] = { NULL, (char*)&req_body };
          int bufcount = 2;
          int rc;
@@ -138,7 +152,7 @@ static int llog_client_destroy(struct llog_handle *loghandle)
          struct obd_import     *imp;
          struct ptlrpc_request *req = NULL;
          struct llogd_body *body;
-        int size[] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          int rc;
          ENTRY;
  
@@ -170,7 +184,7 @@ static int llog_client_next_block(struct llog_handle *loghandle,
          struct ptlrpc_request *req = NULL;
          struct llogd_body *body;
          void * ptr;
-        int size[3] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          int rc;
          ENTRY;
  
@@ -228,7 +242,7 @@ static int llog_client_prev_block(struct llog_handle *loghandle,
          struct ptlrpc_request *req = NULL;
          struct llogd_body *body;
          void * ptr;
-        int size[3] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[3] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          int rc;
          ENTRY;
  
@@ -280,8 +294,8 @@ static int llog_client_read_header(struct llog_handle *handle)
          struct llogd_body *body;
          struct llog_log_hdr *hdr;
          struct llog_rec_hdr *llh_hdr;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
-        int repsize[2] = { sizeof(struct ptlrpc_body), sizeof(*hdr) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 repsize[2] = { sizeof(struct ptlrpc_body), sizeof(*hdr) };
          int rc;
          ENTRY;
  
diff --git a/lustre/ptlrpc/llog_net.c b/lustre/ptlrpc/llog_net.c

index 8d955d3..87e3566 100644 (file)
--- a/lustre/ptlrpc/llog_net.c
+++ b/lustre/ptlrpc/llog_net.c
@@ -1,32 +1,47 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_net.c
   *
   * OST<->MDS recovery logging infrastructure.
   *
   * Invariants in implementation:
   * - we do not share logs among different OST<->MDS connections, so that
   *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -48,7 +63,7 @@
  #include <lustre_fsfilt.h>
  
  #ifdef __KERNEL__
-int llog_origin_connect(struct llog_ctxt *ctxt, int count,
+int llog_origin_connect(struct llog_ctxt *ctxt,
                          struct llog_logid *logid, struct llog_gen *gen,
                          struct obd_uuid *uuid)
  {
@@ -58,11 +73,17 @@ int llog_origin_connect(struct llog_ctxt *ctxt, int count,
          struct llogd_conn_body *req_body;
          int size[2] = { sizeof(struct ptlrpc_body),
                          sizeof(struct llogd_conn_body) };
-        struct inode* inode = ctxt->loc_handle->lgh_file->f_dentry->d_inode;
+        struct inode *inode;
          void *handle;
          int rc, rc1;
          ENTRY;
  
+        LASSERT(ctxt != NULL);
+        LASSERT(ctxt->loc_handle != NULL);
+        LASSERT(ctxt->loc_handle->lgh_file != NULL);
+        LASSERT(ctxt->loc_handle->lgh_file->f_dentry != NULL);
+        inode = ctxt->loc_handle->lgh_file->f_dentry->d_inode;
+
          if (list_empty(&ctxt->loc_handle->u.chd.chd_head)) {
                  CDEBUG(D_HA, "there is no record related to ctxt %p\n", ctxt);
                  RETURN(0);
@@ -78,9 +99,9 @@ int llog_origin_connect(struct llog_ctxt *ctxt, int count,
          lgr->lgr_hdr.lrh_len = lgr->lgr_tail.lrt_len = sizeof(*lgr);
          lgr->lgr_hdr.lrh_type = LLOG_GEN_REC;
  
-        handle = fsfilt_start_log(ctxt->loc_exp->exp_obd, inode, 
+        handle = fsfilt_start_log(ctxt->loc_exp->exp_obd, inode,
                                    FSFILT_OP_CANCEL_UNLINK, NULL, 1);
-       
+
          if (IS_ERR(handle)) {
                  CERROR("fsfilt_start failed: %ld\n", PTR_ERR(handle));
                  OBD_FREE(lgr, sizeof(*lgr));
@@ -90,7 +111,7 @@ int llog_origin_connect(struct llog_ctxt *ctxt, int count,
          lgr->lgr_gen = ctxt->loc_gen;
          rc = llog_add(ctxt, &lgr->lgr_hdr, NULL, NULL, 1);
          OBD_FREE(lgr, sizeof(*lgr));
-        
+
          rc1 = fsfilt_commit(ctxt->loc_exp->exp_obd, inode, handle, 0);
          if (rc != 1 || rc1 != 0) {
                  rc = (rc != 1) ? rc : rc1;
@@ -132,7 +153,7 @@ int llog_handle_connect(struct ptlrpc_request *req)
                                    sizeof(*req_body));
  
          ctxt = llog_get_context(obd, req_body->lgdc_ctxt_idx);
-        rc = llog_connect(ctxt, 1, &req_body->lgdc_logid,
+        rc = llog_connect(ctxt, &req_body->lgdc_logid,
                            &req_body->lgdc_gen, NULL);
  
          llog_ctxt_put(ctxt);
@@ -149,9 +170,11 @@ int llog_receptor_accept(struct llog_ctxt *ctxt, struct obd_import *imp)
          LASSERT(ctxt);
          mutex_down(&ctxt->loc_sem);
          if (ctxt->loc_imp != imp) {
-                CWARN("changing the import %p - %p\n", ctxt->loc_imp, imp);
-                if (ctxt->loc_imp)
+                if (ctxt->loc_imp) {
+                        CWARN("changing the import %p - %p\n",
+                              ctxt->loc_imp, imp);
                          class_import_put(ctxt->loc_imp);
+                }
                  ctxt->loc_imp = class_import_get(imp);
          }
          mutex_up(&ctxt->loc_sem);
@@ -161,7 +184,7 @@ EXPORT_SYMBOL(llog_receptor_accept);
  
  #else /* !__KERNEL__ */
  
-int llog_origin_connect(struct llog_ctxt *ctxt, int count,
+int llog_origin_connect(struct llog_ctxt *ctxt,
                          struct llog_logid *logid, struct llog_gen *gen,
                          struct obd_uuid *uuid)
  {
diff --git a/lustre/ptlrpc/llog_server.c b/lustre/ptlrpc/llog_server.c

index dff4397..9888c26 100644 (file)
--- a/lustre/ptlrpc/llog_server.c
+++ b/lustre/ptlrpc/llog_server.c
@@ -1,29 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *  remote api for llog - server side
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_server.c
+ *
+ * remote api for llog - server side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -62,7 +76,7 @@ int llog_origin_handle_create(struct ptlrpc_request *req)
          body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
                                   lustre_swab_llogd_body);
          if (body == NULL) {
-                CERROR ("Can't unpack llogd_body\n");
+                CERROR("Can't unpack llogd_body\n");
                  RETURN(-EFAULT);
          }
  
@@ -75,12 +89,13 @@ int llog_origin_handle_create(struct ptlrpc_request *req)
                          CERROR("Can't unpack name\n");
                          RETURN(-EFAULT);
                  }
-                CDEBUG(D_INFO, "opening log %s\n", name);
+                CDEBUG(D_INFO, "Opening log %s\n", name);
          }
  
          ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
          if (ctxt == NULL)
-                RETURN(-EINVAL);
+                RETURN(-ENODEV);
+
          disk_obd = ctxt->loc_exp->exp_obd;
          push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
  
@@ -94,7 +109,7 @@ int llog_origin_handle_create(struct ptlrpc_request *req)
  
          body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
          body->lgd_logid = loghandle->lgh_id;
-
+        EXIT;
  out_close:
          rc2 = llog_close(loghandle);
          if (!rc)
@@ -102,7 +117,7 @@ out_close:
  out_pop:
          pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
          llog_ctxt_put(ctxt);
-        RETURN(rc);
+        return rc;
  }
  
  int llog_origin_handle_destroy(struct ptlrpc_request *req)
@@ -132,14 +147,20 @@ int llog_origin_handle_destroy(struct ptlrpc_request *req)
  
          ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
          if (ctxt == NULL)
-                RETURN(-EINVAL);
+                RETURN(-ENODEV);
  
          disk_obd = ctxt->loc_exp->exp_obd;
          push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
  
          rc = llog_create(ctxt, &loghandle, logid, NULL);
-        if (rc)
+        if (rc) {
+                /* This might already be killed. Let's check if this is
+                 * resent case. */
+                if (rc == -ENOENT &&
+                    (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
+                        rc = 0;
                  GOTO(out_pop, rc);
+        }
  
          rc = lustre_pack_reply(req, 2, size, NULL);
          if (rc)
@@ -153,16 +174,18 @@ int llog_origin_handle_destroy(struct ptlrpc_request *req)
                  GOTO(out_close, rc);
          rc = llog_destroy(loghandle);
          if (rc)
+                /* Do not check for resent as this is already done above after
+                 * llog_create(). */
                  GOTO(out_close, rc);
          llog_free_handle(loghandle);
-
+        EXIT;
  out_close:
          if (rc)
                  llog_close(loghandle);
  out_pop:
          pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
          llog_ctxt_put(ctxt);
-        RETURN(rc);
+        return rc;
  }
  
  int llog_origin_handle_next_block(struct ptlrpc_request *req)
@@ -196,7 +219,8 @@ int llog_origin_handle_next_block(struct ptlrpc_request *req)
  
          ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
          if (ctxt == NULL)
-                GOTO(out_free, rc = -EINVAL);
+                GOTO(out_free, rc = -ENODEV);
+
          disk_obd = ctxt->loc_exp->exp_obd;
          push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
  
@@ -216,7 +240,6 @@ int llog_origin_handle_next_block(struct ptlrpc_request *req)
          if (rc)
                  GOTO(out_close, rc);
  
-
          rc = lustre_pack_reply(req, 3, size, NULL);
          if (rc)
                  GOTO(out_close, rc = -ENOMEM);
@@ -226,18 +249,17 @@ int llog_origin_handle_next_block(struct ptlrpc_request *req)
  
          ptr = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF+1, LLOG_CHUNK_SIZE);
          memcpy(ptr, buf, LLOG_CHUNK_SIZE);
-
+        EXIT;
  out_close:
          rc2 = llog_close(loghandle);
          if (!rc)
                  rc = rc2;
-
  out_pop:
          pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
          llog_ctxt_put(ctxt);
  out_free:
          OBD_FREE(buf, LLOG_CHUNK_SIZE);
-        RETURN(rc);
+        return rc;
  }
  
  int llog_origin_handle_prev_block(struct ptlrpc_request *req)
@@ -270,7 +292,9 @@ int llog_origin_handle_prev_block(struct ptlrpc_request *req)
                  RETURN(-ENOMEM);
  
          ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
-        LASSERT(ctxt != NULL);
+        if (ctxt == NULL)
+                GOTO(out_free, rc = -ENODEV);
+
          disk_obd = ctxt->loc_exp->exp_obd;
          push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
  
@@ -298,17 +322,17 @@ int llog_origin_handle_prev_block(struct ptlrpc_request *req)
  
          ptr = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF+1, LLOG_CHUNK_SIZE);
          memcpy(ptr, buf, LLOG_CHUNK_SIZE);
-
+        EXIT;
  out_close:
          rc2 = llog_close(loghandle);
          if (!rc)
                  rc = rc2;
-
  out_pop:
          pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
          llog_ctxt_put(ctxt);
+out_free:
          OBD_FREE(buf, LLOG_CHUNK_SIZE);
-        RETURN(rc);
+        return rc;
  }
  
  int llog_origin_handle_read_header(struct ptlrpc_request *req)
@@ -335,15 +359,15 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req)
  
          ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
          if (ctxt == NULL)
-                RETURN(-EINVAL);
+                RETURN(-ENODEV);
+
          disk_obd = ctxt->loc_exp->exp_obd;
          push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
-
          rc = llog_create(ctxt, &loghandle, &body->lgd_logid, NULL);
          if (rc)
                  GOTO(out_pop, rc);
  
-        /* init_handle reads the header */
+        /* llog_init_handle() reads the header */
          flags = body->lgd_llh_flags;
          rc = llog_init_handle(loghandle, flags, NULL);
          if (rc)
@@ -355,7 +379,7 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req)
  
          hdr = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*hdr));
          memcpy(hdr, loghandle->lgh_hdr, sizeof(*hdr));
-
+        EXIT;
  out_close:
          rc2 = llog_close(loghandle);
          if (!rc)
@@ -363,25 +387,22 @@ out_close:
  out_pop:
          pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
          llog_ctxt_put(ctxt);
-        RETURN(rc);
+        return rc;
  }
  
  int llog_origin_handle_close(struct ptlrpc_request *req)
  {
-        int rc;
-
-        rc = 0;
-
-        RETURN(rc);
+        ENTRY;
+        RETURN(0);
  }
  
  int llog_origin_handle_cancel(struct ptlrpc_request *req)
  {
          struct obd_device *obd = req->rq_export->exp_obd;
+        int num_cookies, rc = 0, err, i, failed = 0;
          struct obd_device *disk_obd;
          struct llog_cookie *logcookies;
          struct llog_ctxt *ctxt = NULL;
-        int num_cookies, rc = 0, err, i;
          struct lvfs_run_ctxt saved;
          struct llog_handle *cathandle;
          struct inode *inode;
@@ -393,15 +414,13 @@ int llog_origin_handle_cancel(struct ptlrpc_request *req)
          num_cookies = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF) /
                        sizeof(*logcookies);
          if (logcookies == NULL || num_cookies == 0) {
-                DEBUG_REQ(D_HA, req, "no cookies sent");
+                DEBUG_REQ(D_HA, req, "No llog cookies sent");
                  RETURN(-EFAULT);
          }
  
          ctxt = llog_get_context(obd, logcookies->lgc_subsys);
-        if (ctxt == NULL) {
-                CWARN("llog subsys not setup or already cleanup\n");
-                RETURN(-ENOENT);
-        }
+        if (ctxt == NULL)
+                RETURN(-ENODEV);
  
          disk_obd = ctxt->loc_exp->exp_obd;
          push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
@@ -413,29 +432,46 @@ int llog_origin_handle_cancel(struct ptlrpc_request *req)
                  handle = fsfilt_start_log(disk_obd, inode,
                                            FSFILT_OP_CANCEL_UNLINK, NULL, 1);
                  if (IS_ERR(handle)) {
-                        CERROR("fsfilt_start failed: %ld\n", PTR_ERR(handle));
+                        CERROR("fsfilt_start_log() failed: %ld\n", 
+                               PTR_ERR(handle));
                          GOTO(pop_ctxt, rc = PTR_ERR(handle));
                  }
  
                  rc = llog_cat_cancel_records(cathandle, 1, logcookies);
  
+                /* Do not raise -ENOENT errors for resent rpcs. This rec already
+                 * might be killed. */
+                if (rc == -ENOENT && 
+                    (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) {
+                        /* Do not change this message, reply-single.sh test_59b
+                         * expects to find this in dmesg. */
+                        CDEBUG(D_RPCTRACE, "RESENT cancel req %p - ignored\n",
+                               req);
+                        rc = 0;
+                } else if (rc == 0) {
+                        CDEBUG(D_RPCTRACE, "Canceled %d llog-records\n", 
+                               num_cookies);
+                }
+
                  err = fsfilt_commit(disk_obd, inode, handle, 0);
                  if (err) {
-                        CERROR("error committing transaction: %d\n", err);
+                        CERROR("Error committing transaction: %d\n", err);
                          if (!rc)
                                  rc = err;
+                        failed++;
                          GOTO(pop_ctxt, rc);
-                }
+                } else if (rc)
+                        failed++;
          }
+        EXIT;
  pop_ctxt:
          pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
          if (rc)
-                CERROR("cancel %d llog-records failed: %d\n", num_cookies, rc);
-        else
-                CDEBUG(D_RPCTRACE, "cancel %d llog-records\n", num_cookies);
+                CERROR("Cancel %d of %d llog-records failed: %d\n", 
+                       failed, num_cookies, rc);
  
          llog_ctxt_put(ctxt);
-        RETURN(rc);
+        return rc;
  }
  EXPORT_SYMBOL(llog_origin_handle_cancel);
  
@@ -449,9 +485,10 @@ static int llog_catinfo_config(struct obd_device *obd, char *buf, int buf_len,
          char name[4][64];
          int rc, i, l, remains = buf_len;
          char *out = buf;
+        ENTRY;
  
          if (ctxt == NULL || mds == NULL)
-                GOTO(release_ctxt, rc = -EOPNOTSUPP);
+                GOTO(release_ctxt, rc = -ENODEV);
  
          push_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
  
@@ -488,11 +525,12 @@ static int llog_catinfo_config(struct obd_device *obd, char *buf, int buf_len,
                  if (remains <= 0)
                          break;
          }
+        GOTO(out_pop, rc);
  out_pop:
          pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
  release_ctxt:
          llog_ctxt_put(ctxt);
-        RETURN(rc);
+        return rc;
  }
  
  struct cb_data {
@@ -513,6 +551,7 @@ static int llog_catinfo_cb(struct llog_handle *cat,
          struct llog_logid_rec *lir;
          int l, rc, index, count = 0;
          struct cb_data *cbd = (struct cb_data*)data;
+        ENTRY;
  
          if (cbd->init) {
                  out = cbd->out;
@@ -524,13 +563,14 @@ static int llog_catinfo_cb(struct llog_handle *cat,
                  RETURN(-EINVAL);
  
          if (!cbd->ctxt)
-                RETURN(-EINVAL);
+                RETURN(-ENODEV);
          
          lir = (struct llog_logid_rec *)rec;
          logid = &lir->lid_id;
          rc = llog_create(ctxt, &handle, logid, NULL);
          if (rc)
                  RETURN(-EINVAL);
+
          rc = llog_init_handle(handle, 0, NULL);
          if (rc)
                  GOTO(out_close, rc);
@@ -554,10 +594,10 @@ static int llog_catinfo_cb(struct llog_handle *cat,
                  CWARN("Not enough memory\n");
                  rc = -ENOMEM;
          }
-
+        GOTO(out_close, rc);
  out_close:
          llog_close(handle);
-        RETURN(rc);
+        return rc;
  }
  
  static int llog_catinfo_deletions(struct obd_device *obd, char *buf,
@@ -568,23 +608,24 @@ static int llog_catinfo_deletions(struct obd_device *obd, char *buf,
          struct lvfs_run_ctxt saved;
          int size, i, count;
          struct llog_catid *idarray;
-        struct llog_logid *id;
          char name[32] = CATLIST;
          int rc;
          struct cb_data data;
          struct llog_ctxt *ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+        ENTRY;
  
          if (ctxt == NULL || mds == NULL)
-                GOTO(release_ctxt, rc = -EOPNOTSUPP);
+                GOTO(release_ctxt, rc = -ENODEV);
         
          count = mds->mds_lov_desc.ld_tgt_count;
          size = sizeof(*idarray) * count;
  
-        OBD_ALLOC(idarray, size);
+        OBD_VMALLOC(idarray, size);
          if (!idarray)
                  GOTO(release_ctxt, rc = -ENOMEM);
  
-        rc = llog_get_cat_list(obd, obd, name, count, idarray);
+        mutex_down(&obd->obd_llog_cat_process);
+        rc = llog_get_cat_list(obd, obd, name, 0, count, idarray);
          if (rc)
                  GOTO(out_free, rc);
  
@@ -596,8 +637,7 @@ static int llog_catinfo_deletions(struct obd_device *obd, char *buf,
          for (i = 0; i < count; i++) {
                  int l, index, uncanceled = 0;
  
-                id = &idarray[i].lci_logid;
-                rc = llog_create(ctxt, &handle, id, NULL);
+                rc = llog_create(ctxt, &handle, &idarray[i].lci_logid, NULL);
                  if (rc)
                          GOTO(out_pop, rc);
                  rc = llog_init_handle(handle, 0, NULL);
@@ -612,8 +652,9 @@ static int llog_catinfo_deletions(struct obd_device *obd, char *buf,
                  l = snprintf(data.out, data.remains,
                               "\n[Catlog ID]: #"LPX64"#"LPX64"#%08x  "
                               "[Log Count]: %d\n",
-                             id->lgl_oid, id->lgl_ogr, id->lgl_ogen,
-                             uncanceled);
+                             idarray[i].lci_logid.lgl_oid,
+                             idarray[i].lci_logid.lgl_ogr,
+                             idarray[i].lci_logid.lgl_ogen, uncanceled);
  
                  data.out += l;
                  data.remains -= l;
@@ -625,14 +666,15 @@ static int llog_catinfo_deletions(struct obd_device *obd, char *buf,
                  if (data.remains <= 0)
                          break;
          }
+        GOTO(out_pop, rc);
  out_pop:
          pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
  out_free:
-        OBD_FREE(idarray, size);
+        mutex_up(&obd->obd_llog_cat_process);
+        OBD_VFREE(idarray, size);
  release_ctxt:
          llog_ctxt_put(ctxt);
-
-        RETURN(rc);
+        return rc;
  }
  
  int llog_catinfo(struct ptlrpc_request *req)
@@ -643,11 +685,11 @@ int llog_catinfo(struct ptlrpc_request *req)
          char *buf, *reply;
          int rc, buf_len = LLOG_CHUNK_SIZE;
          int size[2] = { sizeof(struct ptlrpc_body), buf_len };
+        ENTRY;
  
          OBD_ALLOC(buf, buf_len);
          if (buf == NULL)
-                return -ENOMEM;
-        memset(buf, 0, buf_len);
+                RETURN(-ENOMEM);
  
          keyword = lustre_msg_string(req->rq_reqmsg, REQ_REC_OFF, 0);
  
@@ -669,7 +711,7 @@ int llog_catinfo(struct ptlrpc_request *req)
          if (strlen(buf) == 0)
                  sprintf(buf, "%s", "No log informations\n");
          memcpy(reply, buf, buf_len);
-
+        GOTO(out_free, rc);
  out_free:
          OBD_FREE(buf, buf_len);
          return rc;
diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c

index 8a2c375..12c996f 100644 (file)
--- a/lustre/ptlrpc/lproc_ptlrpc.c
+++ b/lustre/ptlrpc/lproc_ptlrpc.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #define DEBUG_SUBSYSTEM S_CLASS
  
@@ -104,6 +115,9 @@ struct ll_rpc_opcode {
          { LLOG_CATINFO,                  "llog_catinfo" },
          { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" },
          { LLOG_ORIGIN_HANDLE_DESTROY,    "llog_origin_handle_destroy" },
+        { QUOTA_DQACQ,      "quota_acquire" },
+        { QUOTA_DQREL,      "quota_release" },
+        { SEQ_QUERY,        "seq_query" },
  };
  
  struct ll_eopcode {
@@ -115,26 +129,32 @@ struct ll_eopcode {
          { LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
          { LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
          { LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
+        { MDS_REINT_SETATTR,    "mds_reint_setattr" },
          { MDS_REINT_CREATE,     "mds_reint_create" },
          { MDS_REINT_LINK,       "mds_reint_link" },
-        { MDS_REINT_OPEN,       "mds_reint_open" },
-        { MDS_REINT_SETATTR,    "mds_reint_setattr" },
+        { MDS_REINT_UNLINK,     "mds_reint_unlink" },
          { MDS_REINT_RENAME,     "mds_reint_rename" },
-        { MDS_REINT_UNLINK,     "mds_reint_unlink" }
+        { MDS_REINT_OPEN,       "mds_reint_open" },
+        { BRW_READ_BYTES,       "read_bytes" },
+        { BRW_WRITE_BYTES,      "write_bytes" },
  };
  
  const char *ll_opcode2str(__u32 opcode)
  {
          /* When one of the assertions below fail, chances are that:
-         *     1) A new opcode was added in lustre_idl.h, but was
-         *        is missing from the table above.
+         *     1) A new opcode was added in include/lustre/lustre_idl.h,
+         *        but is missing from the table above.
           * or  2) The opcode space was renumbered or rearranged,
           *        and the opcode_offset() function in
           *        ptlrpc_internal.h needs to be modified.
           */
          __u32 offset = opcode_offset(opcode);
-        LASSERT(offset < LUSTRE_MAX_OPCODES);
-        LASSERT(ll_rpc_opcode_table[offset].opcode == opcode);
+        LASSERTF(offset < LUSTRE_MAX_OPCODES,
+                 "offset %u >= LUSTRE_MAX_OPCODES %u\n",
+                 offset, LUSTRE_MAX_OPCODES);
+        LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode,
+                 "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n",
+                 offset, ll_rpc_opcode_table[offset].opcode, opcode);
          return ll_rpc_opcode_table[offset].opname;
  }
  
@@ -143,6 +163,7 @@ const char* ll_eopcode2str(__u32 opcode)
          LASSERT(ll_eopcode_table[opcode].opcode == opcode);
          return ll_eopcode_table[opcode].opname;
  }
+
  #ifdef LPROCFS
  void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
                               char *name, struct proc_dir_entry **procroot_ret,
@@ -182,9 +203,20 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
          lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
                               svc_counter_config, "reqbuf_avail", "bufs");
          for (i = 0; i < EXTRA_LAST_OPC; i++) {
+                char *units;
+
+                switch(i) {
+                case BRW_WRITE_BYTES:
+                case BRW_READ_BYTES:
+                        units = "bytes";
+                        break;
+                default:
+                        units = "reqs";
+                        break;
+                }
                  lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
                                       svc_counter_config,
-                                     ll_eopcode2str(i), "reqs");
+                                     ll_eopcode2str(i), units);
          }
          for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
                  __u32 opcode = ll_rpc_opcode_table[i].opcode;
@@ -406,8 +438,8 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
                   * parser. Currently I only print stuff here I know is OK
                   * to look at coz it was set up in request_in_callback()!!! */
                  seq_printf(s, LPD64":%s:%s:x"LPD64":%d:%s:%ld:%lds(%+lds) ",
-                           req->rq_history_seq, libcfs_nid2str(req->rq_self), 
-                           libcfs_id2str(req->rq_peer), req->rq_xid, 
+                           req->rq_history_seq, libcfs_nid2str(req->rq_self),
+                           libcfs_id2str(req->rq_peer), req->rq_xid,
                             req->rq_reqlen, ptlrpc_rqphase2str(req),
                             req->rq_arrival_time.tv_sec,
                             req->rq_sent - req->rq_arrival_time.tv_sec,
@@ -467,14 +499,40 @@ static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off,
                  rc += snprintf(page + rc, count - rc,
                                "adaptive timeouts off, using obd_timeout %u\n",
                                obd_timeout);
-        rc += snprintf(page + rc, count - rc, 
+        rc += snprintf(page + rc, count - rc,
                         "%10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
-                       "service", cur, worst, worstt, 
+                       "service", cur, worst, worstt,
                         DHMS_VARS(&ts));
          rc = lprocfs_at_hist_helper(page, count, rc,
                                      &svc->srv_at_estimate);
          return rc;
-}               
+}
+
+static int ptlrpc_lprocfs_rd_hp_ratio(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{
+        struct ptlrpc_service *svc = data;
+        int rc = snprintf(page, count, "%d", svc->srv_hpreq_ratio);
+        return rc;
+}
+
+static int ptlrpc_lprocfs_wr_hp_ratio(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
+{
+        struct ptlrpc_service *svc = data;
+        int rc, val;
+        
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc < 0)
+                return rc;
+        if (val < 0)
+                return -ERANGE;
+
+        spin_lock(&svc->srv_lock);
+        svc->srv_hpreq_ratio = val;
+        spin_unlock(&svc->srv_lock);
+        return count;
+}
  
  void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
                                       struct ptlrpc_service *svc)
@@ -491,6 +549,10 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
                  {.name       = "timeouts",
                   .read_fptr  = ptlrpc_lprocfs_rd_timeouts,
                   .data       = svc},
+                {.name       = "high_priority_ratio",
+                 .read_fptr  = ptlrpc_lprocfs_rd_hp_ratio,
+                 .write_fptr = ptlrpc_lprocfs_wr_hp_ratio,
+                 .data       = svc},
                  {NULL}
          };
          static struct file_operations req_history_fops = {
@@ -537,27 +599,43 @@ void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req)
          if (svc_stats == NULL || opc <= 0)
                  return;
          LASSERT(opc < LUSTRE_MAX_OPCODES);
-        /* These two use the ptlrpc_lprocfs_brw below */
-        if (!(opc == OST_WRITE || opc == OST_READ || op == LDLM_ENQUEUE
-              || op == MDS_REINT))
+        if (!(op == LDLM_ENQUEUE || op == MDS_REINT))
                  lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, 0);
  }
  
-void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int opc, int bytes)
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes)
  {
          struct lprocfs_stats *svc_stats;
+        int idx;
+
+        if (!req->rq_import)
+                return;
          svc_stats = req->rq_import->imp_obd->obd_svc_stats;
-        if (!svc_stats) 
+        if (!svc_stats)
                  return;
-        lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, bytes);
+        idx = lustre_msg_get_opc(req->rq_reqmsg);
+        switch (idx) {
+        case OST_READ:
+                idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR;
+                break;
+        case OST_WRITE:
+                idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR;
+                break;
+        default:
+                LASSERTF(0, "unsupported opcode %u\n", idx);
+                break;
+        }
+
+        lprocfs_counter_add(svc_stats, idx, bytes);
  }
+
  EXPORT_SYMBOL(ptlrpc_lprocfs_brw);
  
  void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
  {
-        if (svc->srv_procroot != NULL) 
+        if (svc->srv_procroot != NULL)
                  lprocfs_remove(&svc->srv_procroot);
-        if (svc->srv_stats) 
+        if (svc->srv_stats)
                  lprocfs_free_stats(&svc->srv_stats);
  }
  
@@ -577,12 +655,12 @@ int lprocfs_wr_evict_client(struct file *file, const char *buffer,
          struct obd_device *obd = data;
          char tmpbuf[sizeof(struct obd_uuid)];
  
-        /* Kludge code(deadlock situation): the lprocfs lock has been held 
+        /* Kludge code(deadlock situation): the lprocfs lock has been held
           * since the client is evicted by writting client's
-         * uuid/nid to procfs "evict_client" entry. However, 
+         * uuid/nid to procfs "evict_client" entry. However,
           * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy
           * the proc entries under the being destroyed export{}, so I have
-         * to drop the lock at first here. 
+         * to drop the lock at first here.
           * - jay, jxiong@clusterfs.com */
          class_incref(obd);
          LPROCFS_EXIT();
@@ -611,16 +689,12 @@ int lprocfs_wr_ping(struct file *file, const char *buffer,
          ENTRY;
  
          LPROCFS_CLIMP_CHECK(obd);
-        req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OBD_VERSION,
-                              OBD_PING, 1, NULL, NULL);
+        req = ptlrpc_prep_ping(obd->u.cli.cl_import);
          LPROCFS_CLIMP_EXIT(obd);
          if (req == NULL)
                  RETURN(-ENOMEM);
  
-        ptlrpc_req_set_repsize(req, 1, NULL);
          req->rq_send_state = LUSTRE_IMP_FULL;
-        req->rq_no_resend = 1;
-        req->rq_no_delay = 1;
  
          rc = ptlrpc_queue_wait(req);
  
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c

index 99f3d8a..6207437 100644 (file)
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -85,7 +96,7 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
          RETURN (0);
  }
  
-int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc)
+int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
  {
          struct ptlrpc_connection *conn = desc->bd_export->exp_connection;
          int                       rc;
@@ -153,16 +164,16 @@ int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc)
          RETURN(0);
  }
  
-void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
+/* Server side bulk abort. Idempotent. Not thread-safe (i.e. only
+ * serialises with completion callback) */
+void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc)
  {
-        /* Server side bulk abort. Idempotent. Not thread-safe (i.e. only
-         * serialises with completion callback) */
-        struct l_wait_info lwi;
-        int                rc;
+        struct l_wait_info       lwi;
+        int                      rc;
  
-        LASSERT (!in_interrupt ());             /* might sleep */
+        LASSERT(!in_interrupt());               /* might sleep */
  
-        if (!ptlrpc_bulk_active(desc))          /* completed or */
+        if (!ptlrpc_server_bulk_active(desc))   /* completed or */
                  return;                         /* never started */
          
          /* Do not send any meaningful data over the wire for evicted clients */
@@ -174,14 +185,15 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
           * but we must still l_wait_event() in this case, to give liblustre
           * a chance to run server_bulk_callback()*/
  
-        LNetMDUnlink (desc->bd_md_h);
+        LNetMDUnlink(desc->bd_md_h);
  
          for (;;) {
                  /* Network access will complete in finite time but the HUGE
                   * timeout lets us CWARN for visibility of sluggish NALs */
-                lwi = LWI_TIMEOUT (cfs_time_seconds(300), NULL, NULL);
+                lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                           cfs_time_seconds(1), NULL, NULL);
                  rc = l_wait_event(desc->bd_waitq, 
-                                  !ptlrpc_bulk_active(desc), &lwi);
+                                  !ptlrpc_server_bulk_active(desc), &lwi);
                  if (rc == 0)
                          return;
  
@@ -190,7 +202,7 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
          }
  }
  
-int ptlrpc_register_bulk (struct ptlrpc_request *req)
+int ptlrpc_register_bulk(struct ptlrpc_request *req)
  {
          struct ptlrpc_bulk_desc *desc = req->rq_bulk;
          lnet_process_id_t peer;
@@ -264,29 +276,45 @@ int ptlrpc_register_bulk (struct ptlrpc_request *req)
          RETURN(0);
  }
  
-void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
+/* Disconnect a bulk desc from the network. Idempotent. Not
+ * thread-safe (i.e. only interlocks with completion callback). */
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
  {
-        /* Disconnect a bulk desc from the network. Idempotent. Not
-         * thread-safe (i.e. only interlocks with completion callback). */
          struct ptlrpc_bulk_desc *desc = req->rq_bulk;
          cfs_waitq_t             *wq;
          struct l_wait_info       lwi;
          int                      rc;
+        ENTRY;
  
-        LASSERT (!in_interrupt ());     /* might sleep */
+        LASSERT(!in_interrupt());     /* might sleep */
  
-        if (!ptlrpc_bulk_active(desc))  /* completed or */
-                return;                 /* never registered */
+        /* Let's setup deadline for reply unlink. */
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && 
+            async && req->rq_bulk_deadline == 0)
+                req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK;
  
-        LASSERT (desc->bd_req == req);  /* bd_req NULL until registered */
+        if (!ptlrpc_client_bulk_active(req))  /* completed or */
+                RETURN(1);                    /* never registered */
+
+        LASSERT(desc->bd_req == req);  /* bd_req NULL until registered */
  
          /* the unlink ensures the callback happens ASAP and is the last
           * one.  If it fails, it must be because completion just happened,
           * but we must still l_wait_event() in this case to give liblustre
           * a chance to run client_bulk_callback() */
  
-        LNetMDUnlink (desc->bd_md_h);
-        
+        LNetMDUnlink(desc->bd_md_h);
+
+        if (!ptlrpc_client_bulk_active(req))  /* completed or */
+                RETURN(1);                    /* never registered */
+
+        /* Move to "Unregistering" phase as bulk was not unlinked yet. */
+        ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+
+        /* Do not wait for unlink to finish. */
+        if (async)
+                RETURN(0);
+
          if (req->rq_set != NULL)
                  wq = &req->rq_set->set_waitq;
          else
@@ -295,18 +323,22 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
          for (;;) {
                  /* Network access will complete in finite time but the HUGE
                   * timeout lets us CWARN for visibility of sluggish NALs */
-                lwi = LWI_TIMEOUT (cfs_time_seconds(300), NULL, NULL);
-                rc = l_wait_event(*wq, !ptlrpc_bulk_active(desc), &lwi);
-                if (rc == 0)
-                        return;
+                lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                           cfs_time_seconds(1), NULL, NULL);
+                rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi);
+                if (rc == 0) {
+                        ptlrpc_rqphase_move(req, req->rq_next_phase);
+                        RETURN(1);
+                }
  
-                LASSERT (rc == -ETIMEDOUT);
-                DEBUG_REQ(D_WARNING,req,"Unexpectedly long timeout: desc %p",
+                LASSERT(rc == -ETIMEDOUT);
+                DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
                            desc);
          }
+        RETURN(0);
  }
  
-int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
  {
          struct ptlrpc_service     *svc = req->rq_rqbd->rqbd_service;
          struct ptlrpc_reply_state *rs = req->rq_reply_state;
@@ -348,9 +380,11 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
          service_time = max_t(int, cfs_time_current_sec() -
                               req->rq_arrival_time.tv_sec, 1);
          if (!(flags & PTLRPC_REPLY_EARLY) && 
-            (req->rq_type != PTL_RPC_MSG_ERR)) {
-                /* early replies and errors don't count toward our service
-                   time estimate */
+            (req->rq_type != PTL_RPC_MSG_ERR) &&
+            !(lustre_msg_get_flags(req->rq_reqmsg) &
+              (MSG_RESENT | MSG_REPLAY | MSG_LAST_REPLAY))) {
+                /* early replies, errors and recovery requests don't count
+                 * toward our service time estimate */
                  int oldse = at_add(&svc->srv_at_estimate, service_time);
                  if (oldse != 0)
                          DEBUG_REQ(D_ADAPTTO, req,
@@ -381,7 +415,7 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
                          lustre_msg_set_cksum(req->rq_repmsg, 
                                           lustre_msg_calc_cksum(req->rq_repmsg));
                  } else {
-                        offset = lustre_msg_early_size();
+                        offset = lustre_msg_early_size(req);
                  }
          } else {
                  CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
@@ -393,7 +427,7 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
          }
  
          if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
-                conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL);
+                conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
          else
                  conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
  
@@ -414,7 +448,7 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
                  atomic_dec (&svc->srv_outstanding_replies);
                  ptlrpc_req_drop_rs(req);
          }
-        ptlrpc_put_connection(conn);
+        ptlrpc_connection_put(conn);
          return rc;
  }
  
@@ -461,7 +495,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
  
          /* If this is a re-transmit, we're required to have disengaged
           * cleanly from the previous attempt */
-        LASSERT (!request->rq_receiving_reply);
+        LASSERT(!request->rq_receiving_reply);
  
          if (request->rq_import->imp_obd &&
              request->rq_import->imp_obd->obd_fail) {
@@ -469,6 +503,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                         request->rq_import->imp_obd->obd_name);
                  /* this prevents us from waiting in ptlrpc_queue_wait */
                  request->rq_err = 1;
+                request->rq_status = -ENODEV;
                  RETURN(-ENODEV);
          }
  
@@ -488,12 +523,19 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
          lustre_msghdr_set_flags(request->rq_reqmsg,
                                  request->rq_import->imp_msghdr_flags);
  
+        if (request->rq_resend)
+                lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+
          if (!noreply) {
                  LASSERT (request->rq_replen != 0);
                  if (request->rq_repbuf == NULL)
                          OBD_ALLOC(request->rq_repbuf, request->rq_replen);
-                if (request->rq_repbuf == NULL)
+                if (request->rq_repbuf == NULL) {
+                        /* this prevents us from looping in ptlrpc_queue_wait */
+                        request->rq_err = 1;
+                        request->rq_status = -ENOMEM;
                          GOTO(cleanup_bulk, rc = -ENOMEM);
+                }
                  request->rq_repmsg = NULL;
  
                  rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
@@ -518,6 +560,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
          request->rq_net_err = 0;
          request->rq_resend = 0;
          request->rq_restart = 0;
+        request->rq_rep_swab_mask = 0;
          spin_unlock(&request->rq_lock);
  
          if (!noreply) {
@@ -542,7 +585,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                          /* ...but the MD attach didn't succeed... */
                          request->rq_receiving_reply = 0;
                          spin_unlock(&request->rq_lock);
-                        GOTO(cleanup_me, rc -ENOMEM);
+                        GOTO(cleanup_me, rc = -ENOMEM);
                  }
  
                  CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
@@ -592,7 +635,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
          rc2 = LNetMEUnlink(reply_me_h);
          LASSERT (rc2 == 0);
          /* UNLINKED callback called synchronously */
-        LASSERT (!request->rq_receiving_reply);
+        LASSERT(!request->rq_receiving_reply);
  
   cleanup_repmsg:
          OBD_FREE(request->rq_repbuf, request->rq_replen);
@@ -600,13 +643,13 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
          request->rq_repmsg = NULL; //remove
  
   cleanup_bulk:
-        if (request->rq_bulk != NULL)
-                ptlrpc_unregister_bulk(request);
-
+        /* We do sync unlink here as there was no real transfer here so
+         * the chance to have long unlink to sluggish net is smaller here. */
+        ptlrpc_unregister_bulk(request, 0);
          return rc;
  }
  
-int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
  {
          struct ptlrpc_service   *service = rqbd->rqbd_service;
          static lnet_process_id_t  match_id = {LNET_NID_ANY, LNET_PID_ANY};
diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c

index 0ab8100..cee8fd7 100644 (file)
--- a/lustre/ptlrpc/pack_generic.c
+++ b/lustre/ptlrpc/pack_generic.c
@@ -1,31 +1,45 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Eric Barton <eeb@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pack_generic.c
   *
   * (Un)packing of OST requests
   *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eeb@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -38,11 +52,10 @@
  #include <obd_support.h>
  #include <obd_class.h>
  #include <lustre_net.h>
+#include <lustre/ll_fiemap.h>
  
  #if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(1,8,0,0)
  #error "lustre_msg_v1 has been deprecated since 1.6.0, please remove it"
-#elif LUSTRE_VERSION_CODE > OBD_OCD_VERSION(1,6,50,0)
-#warning "lustre_msg_v1 has been deprecated since 1.6.0, consider removing it"
  #endif
  
  static inline int lustre_msg_hdr_size_v1(int count)
@@ -55,7 +68,7 @@ static inline int lustre_msg_hdr_size_v2(int count)
          return size_round(offsetof(struct lustre_msg_v2, lm_buflens[count]));
  }
  
-static int lustre_msg_need_swab(struct lustre_msg *msg)
+int lustre_msg_need_swab(struct lustre_msg *msg)
  {
          return (msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED) ||
                 (msg->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED);
@@ -102,15 +115,23 @@ static int ptlrpc_repbuf_need_swab(struct ptlrpc_request *req, int index)
  
  
  /* early reply size */
-int lustre_msg_early_size() {
+int lustre_msg_early_size(struct ptlrpc_request *req) {
          static int size = 0;
+        /* For b1_6 interoperability */
+        if (req->rq_reqmsg &&
+            req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) {
+                __u32 pb_len = lustre_msg_buflen(req->rq_reqmsg,
+                                               MSG_PTLRPC_BODY_OFF);
+                return lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pb_len);
+        }
+
          if (!size)
                  size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
          return size;
  }
  EXPORT_SYMBOL(lustre_msg_early_size);
  
-static inline int lustre_msg_size_v1(int count, int *lengths)
+static inline int lustre_msg_size_v1(int count, __u32 *lengths)
  {
          int size;
          int i;
@@ -123,7 +144,7 @@ static inline int lustre_msg_size_v1(int count, int *lengths)
          return size;
  }
  
-static inline int lustre_msg_size_v2(int count, int *lengths)
+static inline int lustre_msg_size_v2(int count, __u32 *lengths)
  {
          int size;
          int i;
@@ -141,9 +162,9 @@ static inline int lustre_msg_size_v2(int count, int *lengths)
   *       in the form of a v2 request.  If this is a connection to a v1
   *       target then the first buffer will be stripped because the ptlrpc
   *       data is part of the lustre_msg_v1 header. b=14043 */
-int lustre_msg_size(__u32 magic, int count, int *lens)
+int lustre_msg_size(__u32 magic, int count, __u32 *lens)
  {
-        int size[] = { sizeof(struct ptlrpc_body) };
+        __u32 size[] = { sizeof(struct ptlrpc_body) };
  
          if (!lens) {
                  LASSERT(count == 1);
@@ -151,8 +172,12 @@ int lustre_msg_size(__u32 magic, int count, int *lens)
          }
  
          LASSERT(count > 0);
+#ifdef PTLRPC_INTEROP_1_6
+        LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body) ||
+                lens[MSG_PTLRPC_BODY_OFF] == PTLRPC_BODY_MIN_SIZE);
+#else
          LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
-
+#endif
          switch (magic) {
          case LUSTRE_MSG_MAGIC_V1:
                  return lustre_msg_size_v1(count - 1, lens + 1);
@@ -183,7 +208,7 @@ int lustre_packed_msg_size(struct lustre_msg *msg)
  }
  
  static void
-lustre_init_msg_v1(void *m, int count, int *lens, char **bufs)
+lustre_init_msg_v1(void *m, int count, __u32 *lens, char **bufs)
  {
          struct lustre_msg_v1 *msg = (struct lustre_msg_v1 *)m;
          char *ptr;
@@ -208,7 +233,8 @@ lustre_init_msg_v1(void *m, int count, int *lens, char **bufs)
  }
  
  static void
-lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, int *lens, char **bufs)
+lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+                   char **bufs)
  {
          char *ptr;
          int i;
@@ -231,7 +257,7 @@ lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, int *lens, char **bufs)
  }
  
  static int lustre_pack_request_v1(struct ptlrpc_request *req,
-                                  int count, int *lens, char **bufs)
+                                  int count, __u32 *lens, char **bufs)
  {
          int reqlen;
  
@@ -263,7 +289,7 @@ static int lustre_pack_request_v1(struct ptlrpc_request *req,
  }
  
  static int lustre_pack_request_v2(struct ptlrpc_request *req,
-                                  int count, int *lens, char **bufs)
+                                  int count, __u32 *lens, char **bufs)
  {
          int reqlen;
  
@@ -298,9 +324,9 @@ static int lustre_pack_request_v2(struct ptlrpc_request *req,
  }
  
  int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count,
-                        int *lens, char **bufs)
+                        __u32 *lens, char **bufs)
  {
-        int size[] = { sizeof(struct ptlrpc_body) };
+        __u32 size[] = { sizeof(struct ptlrpc_body) };
  
          if (!lens) {
                  LASSERT(count == 1);
@@ -379,7 +405,7 @@ out:
  }
  
  static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count,
-                                int *lens, char **bufs, int flags)
+                                __u32 *lens, char **bufs, int flags)
  {
          struct ptlrpc_reply_state *rs;
          int                        msg_len;
@@ -396,7 +422,7 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count,
          OBD_ALLOC(rs, size);
          if (unlikely(rs == NULL)) {
                  rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
-                if (!rs) 
+                if (!rs)
                          RETURN (-ENOMEM);
          }
          atomic_set(&rs->rs_refcount, 1);        /* 1 ref for rq_reply_state */
@@ -420,7 +446,7 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count,
  }
  
  static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
-                                int *lens, char **bufs, int flags)
+                                __u32 *lens, char **bufs, int flags)
  {
          struct ptlrpc_reply_state *rs;
          int                        msg_len;
@@ -432,12 +458,18 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
          if ((flags & LPRFL_EARLY_REPLY) == 0)
                  req->rq_packed_final = 1;
  
+        /* use the same size of ptlrpc_body as client requested for
+         * interoperability cases */
+        LASSERT(req->rq_reqmsg);
+        lens[MSG_PTLRPC_BODY_OFF] = lustre_msg_buflen(req->rq_reqmsg,
+                                                      MSG_PTLRPC_BODY_OFF);
+
          msg_len = lustre_msg_size_v2(count, lens);
          size = sizeof(struct ptlrpc_reply_state) + msg_len;
          OBD_ALLOC(rs, size);
          if (unlikely(rs == NULL)) {
                  rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
-                if (!rs) 
+                if (!rs)
                          RETURN (-ENOMEM);
          }
          atomic_set(&rs->rs_refcount, 1);        /* 1 ref for rq_reply_state */
@@ -452,7 +484,6 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
          req->rq_replen = msg_len;
          req->rq_reply_state = rs;
          req->rq_repmsg = rs->rs_msg;
-
          /* server side, no rq_repbuf */
          lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
          lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
@@ -463,10 +494,10 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
          RETURN(0);
  }
  
-int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens,
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens,
                              char **bufs, int flags)
  {
-        int size[] = { sizeof(struct ptlrpc_body) };
+        __u32 size[] = { sizeof(struct ptlrpc_body) };
  
          if (!lens) {
                  LASSERT(count == 1);
@@ -489,7 +520,7 @@ int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens,
          }
  }
  
-int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
+int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens,
                        char **bufs)
  {
          int rc = lustre_pack_reply_flags(req, count, lens, bufs, 0);
@@ -608,7 +639,7 @@ void lustre_shrink_reply_v1(struct ptlrpc_request *req, int segment,
                  msg->lm_buflens[msg->lm_bufcount - 1] = 0;
          }
  
-        req->rq_replen = lustre_msg_size_v1(msg->lm_bufcount, (int *)msg->lm_buflens);
+        req->rq_replen = lustre_msg_size_v1(msg->lm_bufcount, msg->lm_buflens);
  }
  
  void lustre_shrink_reply_v2(struct ptlrpc_request *req, int segment,
@@ -647,13 +678,13 @@ void lustre_shrink_reply_v2(struct ptlrpc_request *req, int segment,
                  msg->lm_buflens[msg->lm_bufcount - 1] = 0;
          }
  
-        req->rq_replen = lustre_msg_size_v2(msg->lm_bufcount, (int *)msg->lm_buflens);
+        req->rq_replen = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
  }
  
  /*
   * shrink @segment to size @newlen. if @move_data is non-zero, we also move
   * data forward from @segment + 1.
- * 
+ *
   * if @newlen == 0, we remove the segment completely, but we still keep the
   * totally bufcount the same to save possible data moving. this will leave a
   * unused segment with size 0 at the tail, but that's ok.
@@ -867,13 +898,13 @@ static inline int lustre_unpack_ptlrpc_body_v2(struct lustre_msg_v2 *m,
  {
          struct ptlrpc_body *pb;
  
-        pb = lustre_msg_buf_v2(m, offset, sizeof(*pb));
+        pb = lustre_msg_buf_v2(m, offset, PTLRPC_BODY_MIN_SIZE);
          if (!pb) {
                  CERROR("error unpacking ptlrpc body\n");
                  return -EFAULT;
          }
          if (swab_needed)
-                lustre_swab_ptlrpc_body(pb);
+                lustre_swab_ptlrpc_body(pb, lustre_msg_buflen(m, offset));
  
          if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) {
                   CERROR("wrong lustre_msg version %08x\n", pb->pb_version);
@@ -883,6 +914,20 @@ static inline int lustre_unpack_ptlrpc_body_v2(struct lustre_msg_v2 *m,
          return 0;
  }
  
+int lustre_unpack_msg_ptlrpc_body(struct lustre_msg *msg,
+                                  int offset, int swab_needed)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+                return 0;
+        case LUSTRE_MSG_MAGIC_V2:
+                return lustre_unpack_ptlrpc_body_v2(msg, offset, swab_needed);
+        default:
+                CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+                return -EINVAL;
+        }
+}
+
  int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset)
  {
          switch (req->rq_reqmsg->lm_magic) {
@@ -1066,6 +1111,7 @@ void *lustre_swab_buf(struct lustre_msg *msg, int index, int min_size,
  {
          void *ptr = NULL;
  
+        LASSERT(msg != NULL);
          switch (msg->lm_magic) {
          case LUSTRE_MSG_MAGIC_V1:
                  ptr = lustre_msg_buf_v1(msg, index - 1, min_size);
@@ -1090,8 +1136,11 @@ void *lustre_swab_buf(struct lustre_msg *msg, int index, int min_size,
  void *lustre_swab_reqbuf(struct ptlrpc_request *req, int index, int min_size,
                           void *swabber)
  {
-        if (!ptlrpc_reqbuf_need_swab(req, index))
-                 swabber = NULL;
+        if (lustre_req_swabbed(req, index))
+                return lustre_msg_buf(req->rq_reqmsg, index, min_size);
+
+        if (!lustre_req_need_swab(req))
+                swabber = NULL;
  
          lustre_set_req_swabbed(req, index);
          return lustre_swab_buf(req->rq_reqmsg, index, min_size, swabber);
@@ -1100,13 +1149,22 @@ void *lustre_swab_reqbuf(struct ptlrpc_request *req, int index, int min_size,
  void *lustre_swab_repbuf(struct ptlrpc_request *req, int index, int min_size,
                           void *swabber)
  {
-        if (!ptlrpc_repbuf_need_swab(req, index))
+        if (lustre_rep_swabbed(req, index))
+                return lustre_msg_buf(req->rq_repmsg, index, min_size);
+
+        if (!lustre_rep_need_swab(req))
                  swabber = NULL;
  
          lustre_set_rep_swabbed(req, index);
          return lustre_swab_buf(req->rq_repmsg, index, min_size, swabber);
  }
  
+static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg)
+{
+        return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+                                 PTLRPC_BODY_MIN_SIZE);
+}
+
  __u32 lustre_msghdr_get_flags(struct lustre_msg *msg)
  {
          switch (msg->lm_magic) {
@@ -1141,9 +1199,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg)
                  return ((struct lustre_msg_v1 *)msg)->lm_flags &
                         MSG_GEN_FLAG_MASK;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1151,7 +1207,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg)
                  return pb->pb_flags;
          }
          default:
-                CERROR("Wrong magic %x\n", msg->lm_magic);
+                CERROR("incorrect message magic: %08x\n", msg->lm_magic);
                  /* flags might be printed in debug code while message
                   * uninitialized */
                  return 0;
@@ -1166,9 +1222,7 @@ void lustre_msg_add_flags(struct lustre_msg *msg, int flags)
                                          MSG_GEN_FLAG_MASK & flags;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_flags |= flags;
                  return;
@@ -1187,9 +1241,7 @@ void lustre_msg_set_flags(struct lustre_msg *msg, int flags)
                                          MSG_GEN_FLAG_MASK & flags;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_flags = flags;
                  return;
@@ -1207,9 +1259,7 @@ void lustre_msg_clear_flags(struct lustre_msg *msg, int flags)
                                          ~(MSG_GEN_FLAG_MASK & flags);
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags);
                  return;
@@ -1226,9 +1276,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg)
                  return ((struct lustre_msg_v1 *)msg)->lm_flags >>
                         MSG_OP_FLAG_SHIFT;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1249,9 +1297,7 @@ void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags)
                          (flags & MSG_GEN_FLAG_MASK) << MSG_OP_FLAG_SHIFT;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_op_flags |= flags;
                  return;
@@ -1270,9 +1316,7 @@ void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
                          ((flags & MSG_GEN_FLAG_MASK) <<MSG_OP_FLAG_SHIFT);
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_op_flags |= flags;
                  return;
@@ -1288,9 +1332,7 @@ struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return &((struct lustre_msg_v1 *)msg)->lm_handle;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return NULL;
@@ -1309,9 +1351,7 @@ __u32 lustre_msg_get_type(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return ((struct lustre_msg_v1 *)msg)->lm_type;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return PTL_RPC_MSG_ERR;
@@ -1330,9 +1370,7 @@ __u32 lustre_msg_get_version(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return ((struct lustre_msg_v1 *)msg)->lm_version;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1351,9 +1389,7 @@ void lustre_msg_add_version(struct lustre_msg *msg, int version)
          case LUSTRE_MSG_MAGIC_V1:
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_version |= version;
                  return;
@@ -1369,9 +1405,7 @@ __u32 lustre_msg_get_opc(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return ((struct lustre_msg_v1 *)msg)->lm_opc;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1390,9 +1424,7 @@ __u64 lustre_msg_get_last_xid(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return ((struct lustre_msg_v1 *)msg)->lm_last_xid;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1411,9 +1443,7 @@ __u64 lustre_msg_get_last_committed(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return ((struct lustre_msg_v1 *)msg)->lm_last_committed;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1426,15 +1456,36 @@ __u64 lustre_msg_get_last_committed(struct lustre_msg *msg)
          }
  }
  
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+                return NULL;
+        case LUSTRE_MSG_MAGIC_V2: {
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+                if (!pb) {
+                        CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                        return 0;
+                }
+#ifdef PTLRPC_INTEROP_1_6
+                if (lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF) < sizeof (*pb))
+                        return NULL;
+#endif
+                return pb->pb_pre_versions;
+        }
+        default:
+                CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+                return NULL;
+        }
+}
+
  __u64 lustre_msg_get_transno(struct lustre_msg *msg)
  {
          switch (msg->lm_magic) {
          case LUSTRE_MSG_MAGIC_V1:
                  return ((struct lustre_msg_v1 *)msg)->lm_transno;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1451,13 +1502,9 @@ int lustre_msg_get_status(struct lustre_msg *msg)
  {
          switch (msg->lm_magic) {
          case LUSTRE_MSG_MAGIC_V1:
-        case LUSTRE_MSG_MAGIC_V1_SWABBED:
                  return ((struct lustre_msg_v1 *)msg)->lm_status;
-        case LUSTRE_MSG_MAGIC_V2:
-        case LUSTRE_MSG_MAGIC_V2_SWABBED: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+        case LUSTRE_MSG_MAGIC_V2: {
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return -EINVAL;
@@ -1478,9 +1525,7 @@ __u64 lustre_msg_get_slv(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return 1;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return -EINVAL;
@@ -1500,9 +1545,7 @@ void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv)
          case LUSTRE_MSG_MAGIC_V1:
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return;
@@ -1522,9 +1565,7 @@ __u32 lustre_msg_get_limit(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return 1;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return -EINVAL;
@@ -1544,9 +1585,7 @@ void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit)
          case LUSTRE_MSG_MAGIC_V1:
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return;
@@ -1566,9 +1605,7 @@ __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return ((struct lustre_msg_v1 *)msg)->lm_conn_cnt;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1612,9 +1649,7 @@ __u32 lustre_msg_get_timeout(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return 0;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1633,9 +1668,7 @@ __u32 lustre_msg_get_service_time(struct lustre_msg *msg)
          case LUSTRE_MSG_MAGIC_V1:
                  return 0;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  if (!pb) {
                          CERROR("invalid msg %p: no ptlrpc body!\n", msg);
                          return 0;
@@ -1661,16 +1694,23 @@ __u32 lustre_msg_get_cksum(struct lustre_msg *msg)
          }
  }
  
+/*
+ * the sizeof ptlrpc_body in 1.6 is 88 bytes (64 bytes shorter than current
+ * size), to be able to interoperate with 1.6 we only calculate checksum
+ * aginst first 88 bytes of ptlrpc_body.
+ */
+static const int ptlrpc_body_size_16 = 88;
+
  __u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
  {
          switch (msg->lm_magic) {
          case LUSTRE_MSG_MAGIC_V1:
                  return 0;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
-                return crc32_le(~(__u32)0, (char *)pb, sizeof(*pb));
+                return crc32_le(~(__u32)0, (unsigned char *)pb,
+                                ptlrpc_body_size_16);
          }
          default:
                  CERROR("incorrect message magic: %08x\n", msg->lm_magic);
@@ -1685,9 +1725,7 @@ void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
                  ((struct lustre_msg_v1 *)msg)->lm_handle = *handle;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_handle = *handle;
                  return;
@@ -1704,9 +1742,7 @@ void lustre_msg_set_type(struct lustre_msg *msg, __u32 type)
                  ((struct lustre_msg_v1 *)msg)->lm_type = type;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_type = type;
                  return;
@@ -1723,9 +1759,7 @@ void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc)
                  ((struct lustre_msg_v1 *)msg)->lm_opc = opc;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_opc = opc;
                  return;
@@ -1742,9 +1776,7 @@ void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid)
                  ((struct lustre_msg_v1 *)msg)->lm_last_xid = last_xid;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_last_xid = last_xid;
                  return;
@@ -1761,9 +1793,7 @@ void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
                  ((struct lustre_msg_v1 *)msg)->lm_last_committed=last_committed;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_last_committed = last_committed;
                  return;
@@ -1773,6 +1803,31 @@ void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
          }
  }
  
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+                return;
+        case LUSTRE_MSG_MAGIC_V2: {
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+                LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+#ifdef PTLRPC_INTEROP_1_6
+                /* do nothing for old clients */
+                if (lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF) < sizeof (*pb))
+                        return;
+#endif
+                pb->pb_pre_versions[0] = versions[0];
+                pb->pb_pre_versions[1] = versions[1];
+                pb->pb_pre_versions[2] = versions[2];
+                pb->pb_pre_versions[3] = versions[3];
+                return;
+        }
+        default:
+                LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+        }
+}
+
+
  void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
  {
          switch (msg->lm_magic) {
@@ -1780,9 +1835,7 @@ void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
                  ((struct lustre_msg_v1 *)msg)->lm_transno = transno;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_transno = transno;
                  return;
@@ -1799,9 +1852,7 @@ void lustre_msg_set_status(struct lustre_msg *msg, __u32 status)
                  ((struct lustre_msg_v1 *)msg)->lm_status = status;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_status = status;
                  return;
@@ -1818,9 +1869,7 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
                  ((struct lustre_msg_v1 *)msg)->lm_conn_cnt = conn_cnt;
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_conn_cnt = conn_cnt;
                  return;
@@ -1836,9 +1885,7 @@ void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
          case LUSTRE_MSG_MAGIC_V1:
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_timeout = timeout;
                  return;
@@ -1854,9 +1901,7 @@ void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
          case LUSTRE_MSG_MAGIC_V1:
                  return;
          case LUSTRE_MSG_MAGIC_V2: {
-                struct ptlrpc_body *pb;
-
-                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
                  LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
                  pb->pb_service_time = service_time;
                  return;
@@ -1883,7 +1928,7 @@ void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
  /* byte flipping routines for all wire types declared in
   * lustre_idl.h implemented here.
   */
-void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
+void lustre_swab_ptlrpc_body(struct ptlrpc_body *b, int msgsize)
  {
          __swab32s (&b->pb_type);
          __swab32s (&b->pb_version);
@@ -1900,6 +1945,13 @@ void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
          __swab32s (&b->pb_service_time);
          __swab64s (&b->pb_slv);
          __swab32s (&b->pb_limit);
+        if (msgsize < offsetof(struct ptlrpc_body, pb_pre_versions[4]))
+                return;
+        __swab64s (&b->pb_pre_versions[0]);
+        __swab64s (&b->pb_pre_versions[1]);
+        __swab64s (&b->pb_pre_versions[2]);
+        __swab64s (&b->pb_pre_versions[3]);
+        CLASSERT(offsetof(typeof(*b), pb_padding) != 0);
  }
  
  void lustre_swab_connect(struct obd_connect_data *ocd)
@@ -2042,7 +2094,7 @@ void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
          __swab32s(&mti->mti_flags);
          __swab32s(&mti->mti_nid_count);
          CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
-        for (i = 0; i < MTI_NIDS_MAX; i++) 
+        for (i = 0; i < MTI_NIDS_MAX; i++)
                  __swab64s(&mti->mti_nids[i]);
  }
  
@@ -2166,6 +2218,30 @@ void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul)
          CLASSERT(offsetof(typeof(*ul), ul_padding_4) != 0);
  }
  
+void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent)
+{
+        __swab64s(&fm_extent->fe_logical);
+        __swab64s(&fm_extent->fe_physical);
+        __swab64s(&fm_extent->fe_length);
+        __swab32s(&fm_extent->fe_flags);
+        __swab32s(&fm_extent->fe_device);
+}
+
+void lustre_swab_fiemap(struct ll_user_fiemap *fiemap)
+{
+        int i;
+
+        __swab64s(&fiemap->fm_start);
+        __swab64s(&fiemap->fm_length);
+        __swab32s(&fiemap->fm_flags);
+        __swab32s(&fiemap->fm_mapped_extents);
+        __swab32s(&fiemap->fm_extent_count);
+        __swab32s(&fiemap->fm_reserved);
+
+        for (i = 0; i < fiemap->fm_mapped_extents; i++)
+                lustre_swab_fiemap_extent(&fiemap->fm_extents[i]);
+}
+
  void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn)
  {
          __swab32s (&rn->rn_opcode);
@@ -2188,18 +2264,17 @@ void lustre_swab_lov_desc (struct lov_desc *ld)
          __swab32s (&ld->ld_tgt_count);
          __swab32s (&ld->ld_active_tgt_count);
          __swab32s (&ld->ld_default_stripe_count);
+        __swab32s (&ld->ld_pattern);
          __swab64s (&ld->ld_default_stripe_size);
          __swab64s (&ld->ld_default_stripe_offset);
-        __swab32s (&ld->ld_pattern);
          __swab32s (&ld->ld_qos_maxage);
          /* uuid endian insensitive */
  }
  
  
-void lustre_swab_lov_user_md(struct lov_user_md *lum)
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
  {
          ENTRY;
-        CDEBUG(D_IOCTL, "swabbing lov_user_md\n");
          __swab32s(&lum->lmm_magic);
          __swab32s(&lum->lmm_pattern);
          __swab64s(&lum->lmm_object_id);
@@ -2210,6 +2285,23 @@ void lustre_swab_lov_user_md(struct lov_user_md *lum)
          EXIT;
  }
  
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{
+        ENTRY;
+        CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+        lustre_swab_lov_user_md_common(lum);
+        EXIT;
+}
+
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{
+        ENTRY;
+        CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+        lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+        /* lmm_pool_name nothing to do with char */
+        EXIT;
+}
+
  void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj)
  {
          ENTRY;
@@ -2224,17 +2316,16 @@ void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj)
          EXIT;
  }
  
-void lustre_swab_lov_user_md_objects(struct lov_user_md *lum)
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                     int stripe_count)
  {
-        struct lov_user_ost_data *lod;
          int i;
          ENTRY;
-        for (i = 0; i < lum->lmm_stripe_count; i++) {
-                lod = &lum->lmm_objects[i];
-                __swab64s(&lod->l_object_id);
-                __swab64s(&lod->l_object_gr);
-                __swab32s(&lod->l_ost_gen);
-                __swab32s(&lod->l_ost_idx);
+        for (i = 0; i < stripe_count; i++) {
+                __swab64s(&(lod[i].l_object_id));
+                __swab64s(&(lod[i].l_object_gr));
+                __swab32s(&(lod[i].l_ost_gen));
+                __swab32s(&(lod[i].l_ost_idx));
          }
          EXIT;
  }
@@ -2369,6 +2460,7 @@ int quota_get_qdata(void *request, struct qunit_data *qdata,
          int size2 = sizeof(struct qunit_data_old2);
          __u64  flags = is_exp ? req->rq_export->exp_connect_flags :
                         req->rq_import->imp_connect_data.ocd_connect_flags;
+        int rc = 0;
  
          LASSERT(req);
          LASSERT(qdata);
@@ -2394,6 +2486,8 @@ int quota_get_qdata(void *request, struct qunit_data *qdata,
                          new = lustre_swab_repbuf(req, REPLY_REC_OFF,
                                                   sizeof(struct qunit_data),
                                                   lustre_swab_qdata);
+                if (new == NULL)
+                        GOTO(out, rc = -EPROTO);
                  *qdata = *new;
                  QDATA_SET_CHANGE_QS(qdata);
                  return 0;
@@ -2412,6 +2506,8 @@ without_change_qs:
                  else
                          old2 = lustre_swab_repbuf(req, REPLY_REC_OFF, size2,
                                                    lustre_swab_qdata_old2);
+                if (old2 == NULL)
+                        GOTO(out, rc = -EPROTO);
                  qdata_v2_to_v3(old2, qdata);
  
                  return 0;
@@ -2419,8 +2515,8 @@ without_change_qs:
  #else
  #warning "remove quota code above for format absolete in new release"
  #endif
-
-        return 0;
+out:
+        return rc;
  }
  EXPORT_SYMBOL(quota_get_qdata);
  
@@ -2433,6 +2529,7 @@ int quota_copy_qdata(void *request, struct qunit_data *qdata,
          struct qunit_data_old2 *old2;
          __u64  flags = is_exp ? req->rq_export->exp_connect_flags :
                  req->rq_import->imp_connect_data.ocd_connect_flags;
+        int rc = 0;
  
          LASSERT(req);
          LASSERT(qdata);
@@ -2457,7 +2554,7 @@ int quota_copy_qdata(void *request, struct qunit_data *qdata,
                          target = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
                                                  sizeof(struct qunit_data));
                  if (!target)
-                        return -EINVAL;
+                        GOTO(out, rc = -EPROTO);
                  memcpy(target, qdata, sizeof(*qdata));
                  return 0;
          }
@@ -2473,7 +2570,7 @@ without_change_qs:
                          target = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
                                                  sizeof(struct qunit_data_old2));
                  if (!target)
-                        return -EINVAL;
+                        GOTO(out, rc = -EPROTO);
                  old2 = qdata_v3_to_v2(qdata);
                  memcpy(target, old2, sizeof(*old2));
                  return 0;
@@ -2481,8 +2578,8 @@ without_change_qs:
  #else
  #warning "remove quota code above for format absolete in new release"
  #endif
-
-        return 0;
+out:
+        return rc;
  }
  EXPORT_SYMBOL(quota_copy_qdata);
  
@@ -2542,39 +2639,43 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask,
          int rep_fl = 0;
          int rep_status = 0;
  
+        /* Caller is responsible holding a reference on the request */
+        LASSERT(req && atomic_read(&req->rq_refcount) > 0);
+
          if (req->rq_reqmsg &&
              (!lustre_msg_need_swab(req->rq_reqmsg) ||
-            lustre_req_need_swab(req))) {
+             (lustre_req_need_swab(req) &&
+              lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF)))) {
                  opc = lustre_msg_get_opc(req->rq_reqmsg);
                  req_fl = lustre_msg_get_flags(req->rq_reqmsg);
          }
  
          if (req->rq_repmsg &&
-           (!lustre_msg_need_swab(req->rq_repmsg) ||
-            lustre_rep_need_swab(req))) {
+            (!lustre_msg_need_swab(req->rq_repmsg) ||
+             (lustre_rep_need_swab(req) &&
+              lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF)))) {
                  rep_fl = lustre_msg_get_flags(req->rq_repmsg);
                  rep_status = lustre_msg_get_status(req->rq_repmsg);
          }
  
          va_start(args, fmt);
-        libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file,
-                           data->msg_fn, data->msg_line, fmt, args,
-                           " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d/%d "
-                           "lens %d/%d e %d to %d dl %ld ref %d "
-                           "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
-                           req, req->rq_xid, req->rq_transno, opc,
-                           req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) :
-                           req->rq_export ?
-                                (char*)req->rq_export->exp_client_uuid.uuid : "<?>",
-                           req->rq_import ?
-                                (char *)req->rq_import->imp_connection->c_remote_uuid.uuid :
-                           req->rq_export ?
-                                (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "<?>",
-                           req->rq_request_portal,  req->rq_reply_portal,
-                           req->rq_reqlen, req->rq_replen,
-                           req->rq_early_count, req->rq_timeout, req->rq_deadline,
-                           atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req),
-                           req_fl, rep_fl, req->rq_status, rep_status);
+        libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask,
+                data->msg_file, data->msg_fn, data->msg_line, fmt, args,
+                " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d/%d lens %d/%d e %d "
+                "to %d dl %ld ref %d fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+                req, req->rq_xid, req->rq_transno, opc,
+                req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) :
+                req->rq_export ?
+                (char*)req->rq_export->exp_client_uuid.uuid : "<?>",
+                req->rq_import ?
+                (char *)req->rq_import->imp_connection->c_remote_uuid.uuid :
+                req->rq_export ?
+                (char *)req->rq_export->exp_connection->c_remote_uuid.uuid :
+                "<?>", req->rq_request_portal,  req->rq_reply_portal,
+                req->rq_reqlen, req->rq_replen,
+                req->rq_early_count, !!req->rq_timeout, req->rq_deadline,
+                atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req),
+                req_fl, rep_fl, req->rq_status, rep_status);
          va_end(args);
  }
  
diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c

index 865dcf0..d53d42c 100644 (file)
--- a/lustre/ptlrpc/pers.c
+++ b/lustre/ptlrpc/pers.c
@@ -1,25 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2004 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c

index 59ccff3..4b9e1a0 100644 (file)
--- a/lustre/ptlrpc/pinger.c
+++ b/lustre/ptlrpc/pinger.c
@@ -1,29 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Portal-RPC reconnection and replay operations, for use in recovery.
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2003 Cluster File Systems, Inc.
- *   Authors: Phil Schwan <phil@clusterfs.com>
- *            Mike Shaver <shaver@clusterfs.com>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pinger.c
+ *
+ * Portal-RPC reconnection and replay operations, for use in recovery.
   */
  
  #ifndef __KERNEL__
@@ -39,20 +51,51 @@
  struct semaphore pinger_sem;
  static struct list_head pinger_imports = CFS_LIST_HEAD_INIT(pinger_imports);
  
+struct ptlrpc_request *
+ptlrpc_prep_ping(struct obd_import *imp)
+{
+        struct ptlrpc_request *req;
+
+        req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION,
+                              OBD_PING, 1, NULL, NULL);
+        if (req) {
+                ptlrpc_req_set_repsize(req, 1, NULL);
+                req->rq_no_resend = req->rq_no_delay = 1;
+        }
+        return req;
+}
+
+int ptlrpc_obd_ping(struct obd_device *obd)
+{
+        int rc;
+        struct ptlrpc_request *req;
+        ENTRY;
+
+        req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+        if (req == NULL)
+                RETURN(-ENOMEM);
+
+        req->rq_send_state = LUSTRE_IMP_FULL;
+
+        rc = ptlrpc_queue_wait(req);
+
+        ptlrpc_req_finished(req);
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_obd_ping);
+
  int ptlrpc_ping(struct obd_import *imp)
  {
          struct ptlrpc_request *req;
          int rc = 0;
          ENTRY;
  
-        req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING, 
-                              1, NULL, NULL);
+        req = ptlrpc_prep_ping(imp);
          if (req) {
                  DEBUG_REQ(D_INFO, req, "pinging %s->%s",
                            imp->imp_obd->obd_uuid.uuid,
                            obd2cli_tgt(imp->imp_obd));
-                req->rq_no_resend = req->rq_no_delay = 1;
-                ptlrpc_req_set_repsize(req, 1, NULL);
                  ptlrpcd_add_req(req);
          } else {
                  CERROR("OOM trying to ping %s->%s\n",
@@ -63,6 +106,7 @@ int ptlrpc_ping(struct obd_import *imp)
  
          RETURN(rc);
  }
+EXPORT_SYMBOL(ptlrpc_ping);
  
  void ptlrpc_update_next_ping(struct obd_import *imp)
  {
@@ -83,6 +127,12 @@ void ptlrpc_ping_import_soon(struct obd_import *imp)
          imp->imp_next_ping = cfs_time_current();
  }
  
+static inline int imp_is_deactive(struct obd_import *imp)
+{
+        return (imp->imp_deactive ||
+                OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE));
+}
+
  #ifdef __KERNEL__
  static int ptlrpc_pinger_main(void *arg)
  {
@@ -127,14 +177,14 @@ static int ptlrpc_pinger_main(void *arg)
                              cfs_time_aftereq(this_ping, 
                                               imp->imp_next_ping - 5 * CFS_TICK)) {
                                  if (level == LUSTRE_IMP_DISCON &&
-                                    !imp->imp_deactive) {
+                                    !imp_is_deactive(imp)) {
                                          /* wait at least a timeout before
                                             trying recovery again. */
                                          imp->imp_next_ping = cfs_time_shift(obd_timeout);
                                          ptlrpc_initiate_recovery(imp);
                                  } else if (level != LUSTRE_IMP_FULL ||
                                           imp->imp_obd->obd_no_recov ||
-                                         imp->imp_deactive) {
+                                         imp_is_deactive(imp)) {
                                          CDEBUG(D_HA, "not pinging %s "
                                                 "(in recovery: %s or recovery "
                                                 "disabled: %u/%u)\n",
@@ -143,7 +193,7 @@ static int ptlrpc_pinger_main(void *arg)
                                                 imp->imp_deactive,
                                                 imp->imp_obd->obd_no_recov);
                                  } else if (imp->imp_pingable || force) {
-                                        ptlrpc_ping(imp);
+                                                ptlrpc_ping(imp);
                                  }
                          } else {
                                  if (!imp->imp_pingable)
@@ -467,6 +517,7 @@ static int pinger_check_rpcs(void *arg)
          struct ptlrpc_request *req;
          struct ptlrpc_request_set *set;
          struct list_head *iter;
+        struct obd_import *imp;
          struct pinger_data *pd = &pinger_args;
          int rc;
  
@@ -528,7 +579,7 @@ static int pinger_check_rpcs(void *arg)
                          req->rq_no_resend = 1;
                          ptlrpc_req_set_repsize(req, 1, NULL);
                          req->rq_send_state = LUSTRE_IMP_FULL;
-                        req->rq_phase = RQ_PHASE_RPC;
+                        ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
                          req->rq_import_generation = generation;
                          ptlrpc_set_add_req(set, req);
                  } else {
@@ -574,17 +625,23 @@ do_check_set:
                  if (req->rq_phase == RQ_PHASE_COMPLETE)
                          continue;
  
-                req->rq_phase = RQ_PHASE_COMPLETE;
-                atomic_dec(&req->rq_import->imp_inflight);
-                set->set_remaining--;
-                /* If it was disconnected, don't sweat it. */
-                if (list_empty(&req->rq_import->imp_pinger_chain)) {
-                        ptlrpc_unregister_reply(req);
-                        continue;
-                }
+                CDEBUG(D_RPCTRACE, "Pinger initiate expire request(%p)\n",
+                       req);
+
+                /* This will also unregister reply. */
+                ptlrpc_expire_one_request(req, 0);
  
-                CDEBUG(D_RPCTRACE, "pinger initiate expire_one_request\n");
-                ptlrpc_expire_one_request(req);
+                /* We're done with this req, let's finally move it to complete
+                 * phase and take care of inflights. */
+                ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
+                imp = req->rq_import;
+                spin_lock(&imp->imp_lock);
+                if (!list_empty(&req->rq_list)) {
+                        list_del_init(&req->rq_list);
+                        atomic_dec(&imp->imp_inflight);
+                }
+                spin_unlock(&imp->imp_lock);
+                set->set_remaining--;
          }
          mutex_up(&pinger_sem);
  
@@ -682,11 +739,13 @@ void ptlrpc_pinger_wake_up()
                  CDEBUG(D_RPCTRACE, "checking import %s->%s\n",
                         imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
  #ifdef ENABLE_LIBLUSTRE_RECOVERY
-                if (imp->imp_state == LUSTRE_IMP_DISCON && !imp->imp_deactive)
+                if (imp->imp_state == LUSTRE_IMP_DISCON &&
+                    !imp_is_deactive(imp))
  #else
                  /*XXX only recover for the initial connection */
                  if (!lustre_handle_is_used(&imp->imp_remote_handle) &&
-                    imp->imp_state == LUSTRE_IMP_DISCON && !imp->imp_deactive)
+                    imp->imp_state == LUSTRE_IMP_DISCON &&
+                    !imp_is_deactive(imp))
  #endif
                          ptlrpc_initiate_recovery(imp);
                  else if (imp->imp_state != LUSTRE_IMP_FULL)
@@ -694,7 +753,7 @@ void ptlrpc_pinger_wake_up()
                                       "state %d, deactive %d\n",
                                       imp->imp_obd->obd_uuid.uuid,
                                       obd2cli_tgt(imp->imp_obd), imp->imp_state,
-                                     imp->imp_deactive);
+                                     imp_is_deactive(imp));
          }
  #endif
          EXIT;
diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h

index 4dd83e5..6fdb453 100644 (file)
--- a/lustre/ptlrpc/ptlrpc_internal.h
+++ b/lustre/ptlrpc/ptlrpc_internal.h
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  /* Intramodule declarations for ptlrpc. */
@@ -37,6 +48,13 @@ struct ptlrpc_request_set;
  extern int test_req_buffer_pressure;
  extern cfs_mem_cache_t *ptlrpc_cbdata_slab;
  
+/* client.c */
+void ptlrpc_init_xid(void);
+
+/* events.c */
+int ptlrpc_init_portals(void);
+void ptlrpc_exit_portals(void);
+
  void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
  void lustre_assert_wire_constants(void);
  int ptlrpc_import_in_recovery(struct obd_import *imp);
@@ -45,6 +63,8 @@ void ptlrpc_handle_failed_import(struct obd_import *imp);
  int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
  void ptlrpc_initiate_recovery(struct obd_import *imp);
  
+int lustre_msg_need_swab(struct lustre_msg *msg);
+int lustre_unpack_msg_ptlrpc_body(struct lustre_msg *msg, int offset, int swab);
  int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset);
  int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);
  
@@ -63,7 +83,7 @@ void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req,
  #endif /* LPROCFS */
  
  /* recovd_thread.c */
-int ptlrpc_expire_one_request(struct ptlrpc_request *req);
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink);
  
  /* pers.c */
  void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc);
@@ -83,6 +103,10 @@ int ping_evictor_wake(struct obd_export *exp);
  #define ping_evictor_wake(exp)     1
  #endif
  
+/* recov_thread.c */
+int llog_recov_init(void);
+void llog_recov_fini(void);
+
  static inline int ll_rpc_recoverable_error(int rc)
  { 
          return (rc == -ENOTCONN || rc == -ENODEV);
diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c

index f74eef1..b798b1a 100644 (file)
--- a/lustre/ptlrpc/ptlrpc_module.c
+++ b/lustre/ptlrpc/ptlrpc_module.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef EXPORT_SYMTAB
@@ -37,14 +48,13 @@
  #include <lustre_net.h>
  
  #include "ptlrpc_internal.h"
+
  cfs_mem_cache_t *ptlrpc_cbdata_slab;
  extern spinlock_t ptlrpc_last_xid_lock;
  extern spinlock_t ptlrpc_rs_debug_lock;
  extern spinlock_t ptlrpc_all_services_lock;
  extern struct semaphore pinger_sem;
  extern struct semaphore ptlrpcd_sem;
-extern int ptlrpc_init_portals(void);
-extern void ptlrpc_exit_portals(void);
  
  __init int ptlrpc_init(void)
  {
@@ -52,23 +62,23 @@ __init int ptlrpc_init(void)
          ENTRY;
  
          lustre_assert_wire_constants();
-        spin_lock_init(&ptlrpc_last_xid_lock);
          spin_lock_init(&ptlrpc_rs_debug_lock);
          spin_lock_init(&ptlrpc_all_services_lock);
          init_mutex(&pinger_sem);
          init_mutex(&ptlrpcd_sem);
+        ptlrpc_init_xid();
  
          rc = ptlrpc_init_portals();
          if (rc)
                  RETURN(rc);
          cleanup_phase = 1;
  
-        rc = ptlrpc_init_connection();
+        rc = ptlrpc_connection_init();
          if (rc)
                  GOTO(cleanup, rc);
          cleanup_phase = 2;
  
-        ptlrpc_put_connection_superhack = ptlrpc_put_connection;
+        ptlrpc_put_connection_superhack = ptlrpc_connection_put;
  
          rc = ptlrpc_start_pinger();
          if (rc)
@@ -86,16 +96,23 @@ __init int ptlrpc_init(void)
          if (ptlrpc_cbdata_slab == NULL)
                  GOTO(cleanup, rc);
  
+        cleanup_phase = 5;
+        rc = llog_recov_init();
+        if (rc)
+                GOTO(cleanup, rc);
+
          RETURN(0);
  
  cleanup:
          switch(cleanup_phase) {
+        case 5:
+                cfs_mem_cache_destroy(ptlrpc_cbdata_slab);
          case 4:
                  ldlm_exit();
          case 3:
                  ptlrpc_stop_pinger();
          case 2:
-                ptlrpc_cleanup_connection();
+                ptlrpc_connection_fini();
          case 1:
                  ptlrpc_exit_portals();
          default: ;
@@ -107,21 +124,20 @@ cleanup:
  #ifdef __KERNEL__
  static void __exit ptlrpc_exit(void)
  {
+        llog_recov_fini();
          ldlm_exit();
          ptlrpc_stop_pinger();
          ptlrpc_exit_portals();
-        ptlrpc_cleanup_connection();
+        ptlrpc_connection_fini();
          cfs_mem_cache_destroy(ptlrpc_cbdata_slab);
  }
  
  /* connection.c */
-EXPORT_SYMBOL(ptlrpc_dump_connections);
-EXPORT_SYMBOL(ptlrpc_readdress_connection);
-EXPORT_SYMBOL(ptlrpc_get_connection);
-EXPORT_SYMBOL(ptlrpc_put_connection);
+EXPORT_SYMBOL(ptlrpc_connection_get);
+EXPORT_SYMBOL(ptlrpc_connection_put);
  EXPORT_SYMBOL(ptlrpc_connection_addref);
-EXPORT_SYMBOL(ptlrpc_init_connection);
-EXPORT_SYMBOL(ptlrpc_cleanup_connection);
+EXPORT_SYMBOL(ptlrpc_connection_init);
+EXPORT_SYMBOL(ptlrpc_connection_fini);
  
  /* niobuf.c */
  EXPORT_SYMBOL(ptlrpc_start_bulk_transfer);
@@ -148,7 +164,6 @@ EXPORT_SYMBOL(ptlrpc_free_rq_pool);
  EXPORT_SYMBOL(ptlrpc_prep_req_pool);
  EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
  EXPORT_SYMBOL(ptlrpc_prep_req);
-EXPORT_SYMBOL(ptlrpc_free_req);
  EXPORT_SYMBOL(ptlrpc_unregister_reply);
  EXPORT_SYMBOL(ptlrpc_req_finished);
  EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);
@@ -184,6 +199,7 @@ EXPORT_SYMBOL(ptlrpc_start_thread);
  EXPORT_SYMBOL(ptlrpc_unregister_service);
  EXPORT_SYMBOL(ptlrpc_daemonize);
  EXPORT_SYMBOL(ptlrpc_service_health_check);
+EXPORT_SYMBOL(ptlrpc_hpreq_reorder);
  
  /* pack_generic.c */
  EXPORT_SYMBOL(lustre_msg_check_version);
@@ -216,7 +232,8 @@ EXPORT_SYMBOL(lustre_swab_mds_rec_link);
  EXPORT_SYMBOL(lustre_swab_mds_rec_unlink);
  EXPORT_SYMBOL(lustre_swab_mds_rec_rename);
  EXPORT_SYMBOL(lustre_swab_lov_desc);
-EXPORT_SYMBOL(lustre_swab_lov_user_md);
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
  EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
  EXPORT_SYMBOL(lustre_swab_lov_user_md_join);
  EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
@@ -247,6 +264,7 @@ EXPORT_SYMBOL(lustre_msg_add_version);
  EXPORT_SYMBOL(lustre_msg_get_opc);
  EXPORT_SYMBOL(lustre_msg_get_last_xid);
  EXPORT_SYMBOL(lustre_msg_get_last_committed);
+EXPORT_SYMBOL(lustre_msg_get_versions);
  EXPORT_SYMBOL(lustre_msg_get_transno);
  EXPORT_SYMBOL(lustre_msg_get_status);
  EXPORT_SYMBOL(lustre_msg_get_slv);
@@ -261,10 +279,12 @@ EXPORT_SYMBOL(lustre_msg_set_type);
  EXPORT_SYMBOL(lustre_msg_set_opc);
  EXPORT_SYMBOL(lustre_msg_set_last_xid);
  EXPORT_SYMBOL(lustre_msg_set_last_committed);
+EXPORT_SYMBOL(lustre_msg_set_versions);
  EXPORT_SYMBOL(lustre_msg_set_transno);
  EXPORT_SYMBOL(lustre_msg_set_status);
  EXPORT_SYMBOL(lustre_msg_set_conn_cnt);
  EXPORT_SYMBOL(lustre_swab_mgs_target_info);
+EXPORT_SYMBOL(lustre_swab_fiemap);
  
  /* recover.c */
  EXPORT_SYMBOL(ptlrpc_disconnect_import);
@@ -299,7 +319,7 @@ EXPORT_SYMBOL(llog_origin_handle_close);
  EXPORT_SYMBOL(llog_client_ops);
  EXPORT_SYMBOL(llog_catinfo);
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Request Processor and Lock Management");
  MODULE_LICENSE("GPL");
  
diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c

index 5b82f57..3ccff0f 100644 (file)
--- a/lustre/ptlrpc/ptlrpcd.c
+++ b/lustre/ptlrpc/ptlrpcd.c
@@ -1,27 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author Peter Braam <braam@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/ptlrpcd.c
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -42,21 +54,6 @@
  #include <obd_support.h> /* for OBD_FAIL_CHECK */
  #include <lprocfs_status.h>
  
-#define LIOD_STOP 0
-struct ptlrpcd_ctl {
-        unsigned long             pc_flags;
-        spinlock_t                pc_lock;
-        struct completion         pc_starting;
-        struct completion         pc_finishing;
-        struct ptlrpc_request_set *pc_set;
-        char                      pc_name[16];
-#ifndef __KERNEL__
-        int                       pc_recurred;
-        void                     *pc_wait_callback;
-        void                     *pc_idle_callback;
-#endif
-};
-
  static struct ptlrpcd_ctl ptlrpcd_pc;
  static struct ptlrpcd_ctl ptlrpcd_recovery_pc;
  
@@ -72,19 +69,39 @@ void ptlrpcd_wake(struct ptlrpc_request *req)
          cfs_waitq_signal(&rq_set->set_waitq);
  }
  
-/* requests that are added to the ptlrpcd queue are sent via
- * ptlrpcd_check->ptlrpc_check_set() */
+/* 
+ * Requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set().
+ */
  void ptlrpcd_add_req(struct ptlrpc_request *req)
  {
          struct ptlrpcd_ctl *pc;
+        int rc;
  
          if (req->rq_send_state == LUSTRE_IMP_FULL)
                  pc = &ptlrpcd_pc;
          else
                  pc = &ptlrpcd_recovery_pc;
-
-        ptlrpc_set_add_new_req(pc->pc_set, req);
-        cfs_waitq_signal(&pc->pc_set->set_waitq);
+        rc = ptlrpc_set_add_new_req(pc, req);
+        if (rc) {
+                int (*interpreter)(struct ptlrpc_request *,
+                                   void *, int);
+                                
+                interpreter = req->rq_interpret_reply;
+
+                /*
+                 * Thread is probably in stop now so we need to
+                 * kill this rpc as it was not added. Let's call
+                 * interpret for it to let know we're killing it
+                 * so that higher levels might free assosiated
+                 * resources.
+                */
+                req->rq_status = -EBADR;
+                interpreter(req, &req->rq_async_args,
+                            req->rq_status);
+                req->rq_set = NULL;
+                ptlrpc_req_finished(req);
+        }
  }
  
  static int ptlrpcd_check(struct ptlrpcd_ctl *pc)
@@ -94,23 +111,25 @@ static int ptlrpcd_check(struct ptlrpcd_ctl *pc)
          int rc = 0;
          ENTRY;
  
-        if (test_bit(LIOD_STOP, &pc->pc_flags))
-                RETURN(1);
-
          spin_lock(&pc->pc_set->set_new_req_lock);
          list_for_each_safe(pos, tmp, &pc->pc_set->set_new_requests) {
                  req = list_entry(pos, struct ptlrpc_request, rq_set_chain);
                  list_del_init(&req->rq_set_chain);
                  ptlrpc_set_add_req(pc->pc_set, req);
-                rc = 1; /* need to calculate its timeout */
+                /* 
+                 * Need to calculate its timeout. 
+                 */
+                rc = 1;
          }
          spin_unlock(&pc->pc_set->set_new_req_lock);
  
          if (pc->pc_set->set_remaining) {
                  rc = rc | ptlrpc_check_set(pc->pc_set);
  
-                /* XXX our set never completes, so we prune the completed
-                 * reqs after each iteration. boy could this be smarter. */
+                /* 
+                 * XXX: our set never completes, so we prune the completed
+                 * reqs after each iteration. boy could this be smarter. 
+                 */
                  list_for_each_safe(pos, tmp, &pc->pc_set->set_requests) {
                          req = list_entry(pos, struct ptlrpc_request,
                                           rq_set_chain);
@@ -124,7 +143,9 @@ static int ptlrpcd_check(struct ptlrpcd_ctl *pc)
          }
  
          if (rc == 0) {
-                /* If new requests have been added, make sure to wake up */
+                /* 
+                 * If new requests have been added, make sure to wake up. 
+                 */
                  spin_lock(&pc->pc_set->set_new_req_lock);
                  rc = !list_empty(&pc->pc_set->set_new_requests);
                  spin_unlock(&pc->pc_set->set_new_req_lock);
@@ -134,43 +155,66 @@ static int ptlrpcd_check(struct ptlrpcd_ctl *pc)
  }
  
  #ifdef __KERNEL__
-/* ptlrpc's code paths like to execute in process context, so we have this
- * thread which spins on a set which contains the io rpcs.  llite specifies
- * ptlrpcd's set when it pushes pages down into the oscs */
+/* 
+ * ptlrpc's code paths like to execute in process context, so we have this
+ * thread which spins on a set which contains the io rpcs. llite specifies
+ * ptlrpcd's set when it pushes pages down into the oscs.
+ */
  static int ptlrpcd(void *arg)
  {
          struct ptlrpcd_ctl *pc = arg;
-        int rc;
+        int rc, exit = 0;
          ENTRY;
  
          if ((rc = cfs_daemonize_ctxt(pc->pc_name))) {
                  complete(&pc->pc_starting);
-                return rc;
+                goto out;
          }
  
          complete(&pc->pc_starting);
  
-        /* this mainloop strongly resembles ptlrpc_set_wait except
-         * that our set never completes.  ptlrpcd_check calls ptlrpc_check_set
-         * when there are requests in the set.  new requests come in
-         * on the set's new_req_list and ptlrpcd_check moves them into
-         * the set. */
-        while (1) {
+        /* 
+         * This mainloop strongly resembles ptlrpc_set_wait() except that our
+         * set never completes.  ptlrpcd_check() calls ptlrpc_check_set() when
+         * there are requests in the set. New requests come in on the set's 
+         * new_req_list and ptlrpcd_check() moves them into the set. 
+         */
+        do {
                  struct l_wait_info lwi;
-                cfs_duration_t timeout;
+                int timeout;
  
-                timeout = cfs_time_seconds(ptlrpc_set_next_timeout(pc->pc_set));
-                lwi = LWI_TIMEOUT(timeout, ptlrpc_expired_set, pc->pc_set);
+                timeout = ptlrpc_set_next_timeout(pc->pc_set);
+                lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), 
+                                  ptlrpc_expired_set, pc->pc_set);
  
                  l_wait_event(pc->pc_set->set_waitq, ptlrpcd_check(pc), &lwi);
  
-                if (test_bit(LIOD_STOP, &pc->pc_flags))
-                        break;
-        }
-        /* wait for inflight requests to drain */
+                /*
+                 * Abort inflight rpcs for forced stop case.
+                 */
+                if (test_bit(LIOD_STOP, &pc->pc_flags)) {
+                        if (test_bit(LIOD_FORCE, &pc->pc_flags))
+                                ptlrpc_abort_set(pc->pc_set);
+                        exit++;
+                }
+
+                /* 
+                 * Let's make one more loop to make sure that ptlrpcd_check()
+                 * copied all raced new rpcs into the set so we can kill them.
+                 */
+        } while (exit < 2);
+
+        /* 
+         * Wait for inflight requests to drain. 
+         */
          if (!list_empty(&pc->pc_set->set_requests))
                  ptlrpc_set_wait(pc->pc_set);
+
          complete(&pc->pc_finishing);
+out:
+        clear_bit(LIOD_START, &pc->pc_flags);
+        clear_bit(LIOD_STOP, &pc->pc_flags);
+        clear_bit(LIOD_FORCE, &pc->pc_flags);
          return 0;
  }
  
@@ -181,14 +225,18 @@ int ptlrpcd_check_async_rpcs(void *arg)
          struct ptlrpcd_ctl *pc = arg;
          int                  rc = 0;
  
-        /* single threaded!! */
+        /* 
+         * Single threaded!! 
+         */
          pc->pc_recurred++;
  
          if (pc->pc_recurred == 1) {
                  rc = ptlrpcd_check(pc);
                  if (!rc)
                          ptlrpc_expired_set(pc->pc_set);
-                /*XXX send replay requests */
+                /* 
+                 * XXX: send replay requests. 
+                 */
                  if (pc == &ptlrpcd_recovery_pc)
                          rc = ptlrpcd_check(pc);
          }
@@ -207,29 +255,37 @@ int ptlrpcd_idle(void *arg)
  
  #endif
  
-static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
+int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
  {
-        int rc;
-
+        int rc = 0;
          ENTRY;
-        memset(pc, 0, sizeof(*pc));
+ 
+        /* 
+         * Do not allow start second thread for one pc. 
+         */
+        if (test_bit(LIOD_START, &pc->pc_flags)) {
+                CERROR("Starting second thread (%s) for same pc %p\n",
+                       name, pc);
+                RETURN(-EALREADY);
+        }
+
+        set_bit(LIOD_START, &pc->pc_flags);
          init_completion(&pc->pc_starting);
          init_completion(&pc->pc_finishing);
-        pc->pc_flags = 0;
          spin_lock_init(&pc->pc_lock);
          snprintf (pc->pc_name, sizeof (pc->pc_name), name);
  
          pc->pc_set = ptlrpc_prep_set();
          if (pc->pc_set == NULL)
-                RETURN(-ENOMEM);
+                GOTO(out, rc = -ENOMEM);
  
  #ifdef __KERNEL__
          rc = cfs_kernel_thread(ptlrpcd, pc, 0);
          if (rc < 0)  {
                  ptlrpc_set_destroy(pc->pc_set);
-                RETURN(rc);
+                GOTO(out, rc);
          }
-
+        rc = 0;
          wait_for_completion(&pc->pc_starting);
  #else
          pc->pc_wait_callback =
@@ -238,14 +294,23 @@ static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
          pc->pc_idle_callback =
                  liblustre_register_idle_callback("ptlrpcd_check_idle_rpcs",
                                                   &ptlrpcd_idle, pc);
-        (void)rc;
  #endif
-        RETURN(0);
+out:
+        if (rc)
+                clear_bit(LIOD_START, &pc->pc_flags);
+        RETURN(rc);
  }
  
-static void ptlrpcd_stop(struct ptlrpcd_ctl *pc)
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force)
  {
+        if (!test_bit(LIOD_START, &pc->pc_flags)) {
+                CERROR("Thread for pc %p was not started\n", pc);
+                return;
+        }
+
          set_bit(LIOD_STOP, &pc->pc_flags);
+        if (force)
+                set_bit(LIOD_FORCE, &pc->pc_flags);
          cfs_waitq_signal(&pc->pc_set->set_waitq);
  #ifdef __KERNEL__
          wait_for_completion(&pc->pc_finishing);
@@ -273,7 +338,7 @@ int ptlrpcd_addref(void)
  
          rc = ptlrpcd_start("ptlrpcd-recov", &ptlrpcd_recovery_pc);
          if (rc) {
-                ptlrpcd_stop(&ptlrpcd_pc);
+                ptlrpcd_stop(&ptlrpcd_pc, 0);
                  --ptlrpcd_users;
                  GOTO(out, rc);
          }
@@ -286,8 +351,8 @@ void ptlrpcd_decref(void)
  {
          mutex_down(&ptlrpcd_sem);
          if (--ptlrpcd_users == 0) {
-                ptlrpcd_stop(&ptlrpcd_pc);
-                ptlrpcd_stop(&ptlrpcd_recovery_pc);
+                ptlrpcd_stop(&ptlrpcd_pc, 0);
+                ptlrpcd_stop(&ptlrpcd_recovery_pc, 0);
          }
          mutex_up(&ptlrpcd_sem);
  }
diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c

index b930aaa..b33522b 100644 (file)
--- a/lustre/ptlrpc/recov_thread.c
+++ b/lustre/ptlrpc/recov_thread.c
@@ -1,32 +1,48 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * OST<->MDS recovery logging thread.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/recov_thread.c
   *
+ * OST<->MDS recovery logging thread.
   * Invariants in implementation:
   * - we do not share logs among different OST<->MDS connections, so that
   *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger   <adilger@clusterfs.com>
+ *         Yury Umanets     <yury.umanets@sun.com>
+ *         Alexey Lyashkov  <alexey.lyashkov@sun.com>
   */
  
  #define DEBUG_SUBSYSTEM S_LOG
@@ -42,9 +58,7 @@
  # include <liblustre.h>
  #endif
  
-#include <libcfs/kp30.h>
  #include <obd_class.h>
-#include <lustre_commit_confd.h>
  #include <obd_support.h>
  #include <obd_class.h>
  #include <lustre_net.h>
@@ -53,623 +67,640 @@
  #include <lustre_log.h>
  #include "ptlrpc_internal.h"
  
+static atomic_t                   llcd_count = ATOMIC_INIT(0);
+static cfs_mem_cache_t           *llcd_cache = NULL;
+
  #ifdef __KERNEL__
+enum {
+        LLOG_LCM_FL_START       = 1 << 0,
+        LLOG_LCM_FL_EXIT        = 1 << 1
+};
  
-/* Allocate new commit structs in case we do not have enough.
- * Make the llcd size small enough that it fits into a single page when we
- * are sending/receiving it. */
-static int llcd_alloc(struct llog_commit_master *lcm)
+static void llcd_print(struct llog_canceld_ctxt *llcd, 
+                       const char *func, int line) 
  {
-        struct llog_canceld_ctxt *llcd;
-        int llcd_size;
-
-        /* payload of lustre_msg V2 is bigger */
-        llcd_size = 4096 - lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
-        OBD_ALLOC(llcd,
-                  llcd_size + offsetof(struct llog_canceld_ctxt, llcd_cookies));
-        if (llcd == NULL)
-                return -ENOMEM;
-
-        llcd->llcd_size = llcd_size;
-        llcd->llcd_lcm = lcm;
-
-        spin_lock(&lcm->lcm_llcd_lock);
-        list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
-        atomic_inc(&lcm->lcm_llcd_numfree);
-        spin_unlock(&lcm->lcm_llcd_lock);
-
-        return 0;
+        CDEBUG(D_RPCTRACE, "Llcd (%p) at %s:%d:\n", llcd, func, line);
+        CDEBUG(D_RPCTRACE, "  size: %d\n", llcd->llcd_size);
+        CDEBUG(D_RPCTRACE, "  ctxt: %p\n", llcd->llcd_ctxt);
+        CDEBUG(D_RPCTRACE, "  lcm : %p\n", llcd->llcd_lcm);
+        CDEBUG(D_RPCTRACE, "  cookiebytes : %d\n", llcd->llcd_cookiebytes);
  }
  
-/* Get a free cookie struct from the list */
-static struct llog_canceld_ctxt *llcd_grab(struct llog_commit_master *lcm)
+/** 
+ * Allocate new llcd from cache, init it and return to caller.
+ * Bumps number of objects allocated.
+ */
+static struct llog_canceld_ctxt *llcd_alloc(struct llog_commit_master *lcm)
  {
          struct llog_canceld_ctxt *llcd;
+        int size, overhead;
+
+        LASSERT(lcm != NULL);
+
+        /* 
+         * We want to send one page of cookies with rpc header. This buffer
+         * will be assigned later to the rpc, this is why we preserve the
+         * space for rpc header.
+         */
+        size = CFS_PAGE_SIZE - lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
+        overhead =  offsetof(struct llog_canceld_ctxt, llcd_cookies);
+        OBD_SLAB_ALLOC(llcd, llcd_cache, CFS_ALLOC_STD, size + overhead);
+        if (!llcd)
+                return NULL;
  
-repeat:
-        spin_lock(&lcm->lcm_llcd_lock);
-        if (list_empty(&lcm->lcm_llcd_free)) {
-                spin_unlock(&lcm->lcm_llcd_lock);
-                if (llcd_alloc(lcm) < 0) {
-                        CERROR("unable to allocate log commit data!\n");
-                        return NULL;
-                }
-                /* check new llcd wasn't grabbed while lock dropped, b=7407 */
-                goto repeat;
-        }
+        CFS_INIT_LIST_HEAD(&llcd->llcd_list);
+        llcd->llcd_cookiebytes = 0;
+        llcd->llcd_size = size;
  
-        llcd = list_entry(lcm->lcm_llcd_free.next, typeof(*llcd), llcd_list);
-        list_del(&llcd->llcd_list);
-        atomic_dec(&lcm->lcm_llcd_numfree);
-        spin_unlock(&lcm->lcm_llcd_lock);
+        spin_lock(&lcm->lcm_lock);
+        llcd->llcd_lcm = lcm;
+        atomic_inc(&lcm->lcm_count);
+        list_add_tail(&llcd->llcd_list, &lcm->lcm_llcds);
+        spin_unlock(&lcm->lcm_lock);
+        atomic_inc(&llcd_count);
  
-        llcd->llcd_cookiebytes = 0;
+        CDEBUG(D_RPCTRACE, "Alloc llcd %p on lcm %p (%d)\n",
+               llcd, lcm, atomic_read(&lcm->lcm_count));
  
          return llcd;
  }
  
-static void llcd_put(struct llog_canceld_ctxt *llcd)
+/**
+ * Returns passed llcd to cache.
+ */
+static void llcd_free(struct llog_canceld_ctxt *llcd)
  {
          struct llog_commit_master *lcm = llcd->llcd_lcm;
+        int size;
  
-        llog_ctxt_put(llcd->llcd_ctxt);
-        if (atomic_read(&lcm->lcm_llcd_numfree) >= lcm->lcm_llcd_maxfree) {
-                int llcd_size = llcd->llcd_size +
-                         offsetof(struct llog_canceld_ctxt, llcd_cookies);
-                OBD_FREE(llcd, llcd_size);
-        } else {
-                spin_lock(&lcm->lcm_llcd_lock);
-                list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
-                atomic_inc(&lcm->lcm_llcd_numfree);
-                spin_unlock(&lcm->lcm_llcd_lock);
+        if (lcm) {
+                if (atomic_read(&lcm->lcm_count) == 0) {
+                        CERROR("Invalid llcd free %p\n", llcd);
+                        llcd_print(llcd, __FUNCTION__, __LINE__);
+                        LBUG();
+                }
+                spin_lock(&lcm->lcm_lock);
+                LASSERT(!list_empty(&llcd->llcd_list));
+                list_del_init(&llcd->llcd_list);
+                atomic_dec(&lcm->lcm_count);
+                spin_unlock(&lcm->lcm_lock);
+
+                CDEBUG(D_RPCTRACE, "Free llcd %p on lcm %p (%d)\n", 
+                       llcd, lcm, atomic_read(&lcm->lcm_count));
          }
+
+        LASSERT(atomic_read(&llcd_count) > 0);
+        atomic_dec(&llcd_count);
+
+        size = offsetof(struct llog_canceld_ctxt, llcd_cookies) + 
+            llcd->llcd_size;
+        OBD_SLAB_FREE(llcd, llcd_cache, size);
  }
  
-/* Send some cookies to the appropriate target */
-static void llcd_send(struct llog_canceld_ctxt *llcd)
+/**
+ * Checks if passed cookie fits into llcd free space buffer. Returns
+ * 1 if yes and 0 otherwise.
+ */
+static inline int 
+llcd_fit(struct llog_canceld_ctxt *llcd, struct llog_cookie *cookies)
  {
-        if (!(llcd->llcd_lcm->lcm_flags & LLOG_LCM_FL_EXIT)) {
-                spin_lock(&llcd->llcd_lcm->lcm_llcd_lock);
-                list_add_tail(&llcd->llcd_list,
-                              &llcd->llcd_lcm->lcm_llcd_pending);
-                spin_unlock(&llcd->llcd_lcm->lcm_llcd_lock);
-        }
-        cfs_waitq_signal_nr(&llcd->llcd_lcm->lcm_waitq, 1);
+        return (llcd->llcd_size - llcd->llcd_cookiebytes >= sizeof(*cookies));
  }
  
  /**
- * Grab llcd and assign it to passed @ctxt. Also set up backward link
- * and get ref on @ctxt.
+ * Copy passed @cookies to @llcd.
   */
-static struct llog_canceld_ctxt *ctxt_llcd_grab(struct llog_ctxt *ctxt)
+static inline void 
+llcd_copy(struct llog_canceld_ctxt *llcd, struct llog_cookie *cookies)
  {
-        struct llog_canceld_ctxt *llcd;
-
-        LASSERT_SEM_LOCKED(&ctxt->loc_sem);
-        llcd = llcd_grab(ctxt->loc_lcm);
-        if (llcd == NULL)
-                return NULL;
-
-        llcd->llcd_ctxt = llog_ctxt_get(ctxt);
-        ctxt->loc_llcd = llcd;
-
-        CDEBUG(D_RPCTRACE,"grab llcd %p:%p\n", ctxt->loc_llcd, ctxt);
-        return llcd;
+        LASSERT(llcd_fit(llcd, cookies));
+        memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes, 
+              cookies, sizeof(*cookies));
+        llcd->llcd_cookiebytes += sizeof(*cookies);
  }
  
  /**
- * Put llcd in passed @ctxt. Set ->loc_llcd to NULL.
+ * Llcd completion function. Called uppon llcd send finish regardless
+ * sending result. Error is passed in @rc. Note, that this will be called
+ * in cleanup time when all inflight rpcs aborted.
   */
-static void ctxt_llcd_put(struct llog_ctxt *ctxt)
+static int 
+llcd_interpret(struct ptlrpc_request *req, void *noused, int rc)
  {
-        mutex_down(&ctxt->loc_sem);
-        if (ctxt->loc_llcd != NULL) {
-                CDEBUG(D_RPCTRACE,"put llcd %p:%p\n", ctxt->loc_llcd, ctxt);
-                llcd_put(ctxt->loc_llcd);
-                ctxt->loc_llcd = NULL;
-        }
-        if (ctxt->loc_imp) {
-                class_import_put(ctxt->loc_imp);
-                ctxt->loc_imp = NULL;
-        }
-        mutex_up(&ctxt->loc_sem);
+        struct llog_canceld_ctxt *llcd = req->rq_async_args.pointer_arg[0];
+        CDEBUG(D_RPCTRACE, "Sent llcd %p (%d) - killing it\n", llcd, rc);
+        llcd_free(llcd);
+        return 0;
  }
-
-/* deleted objects have a commit callback that cancels the MDS
- * log record for the deletion.  The commit callback calls this
- * function
+ 
+/**
+ * Send @llcd to remote node. Free llcd uppon completion or error. Sending
+ * is performed in async style so this function will return asap without 
+ * blocking.
   */
-int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
-                         struct lov_stripe_md *lsm, int count,
-                         struct llog_cookie *cookies, int flags)
+static int llcd_send(struct llog_canceld_ctxt *llcd)
  {
-        struct llog_canceld_ctxt *llcd;
-        int rc = 0;
+        int size[2] = { sizeof(struct ptlrpc_body),
+                        llcd->llcd_cookiebytes };
+        char *bufs[2] = { NULL, (char *)llcd->llcd_cookies };
+        struct obd_import *import = NULL;
+        struct llog_commit_master *lcm;
+        struct ptlrpc_request *req;
+        struct llog_ctxt *ctxt;
+        int rc;
          ENTRY;
  
-        LASSERT(ctxt);
+        ctxt = llcd->llcd_ctxt;
+        if (!ctxt) {
+                CERROR("Invalid llcd with NULL ctxt found (%p)\n", 
+                       llcd);
+                llcd_print(llcd, __FUNCTION__, __LINE__);
+                LBUG();
+        }
+        LASSERT_SEM_LOCKED(&ctxt->loc_sem);
  
-        mutex_down(&ctxt->loc_sem);
-        llcd = ctxt->loc_llcd;
+        if (llcd->llcd_cookiebytes == 0)
+                GOTO(exit, rc = 0);
  
-        if (ctxt->loc_imp == NULL) {
-                CDEBUG(D_RPCTRACE, "no import for ctxt %p\n", ctxt);
-                GOTO(out, rc = 0);
-        }
+        lcm = llcd->llcd_lcm;
  
-        if (count > 0 && cookies != NULL) {
-                if (llcd == NULL) {
-                        llcd = ctxt_llcd_grab(ctxt);
-                        if (llcd == NULL) {
-                                CERROR("couldn't get an llcd - dropped "LPX64
-                                       ":%x+%u\n",
-                                       cookies->lgc_lgl.lgl_oid,
-                                       cookies->lgc_lgl.lgl_ogen,
-                                       cookies->lgc_index);
-                                GOTO(out, rc = -ENOMEM);
-                        }
-                }
+        /* 
+         * Check if we're in exit stage. Do not send llcd in
+         * this case. 
+         */
+        if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags))
+                GOTO(exit, rc = -ENODEV);
  
-                memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes,
-                       cookies, sizeof(*cookies));
-                llcd->llcd_cookiebytes += sizeof(*cookies);
-        } else {
-                if (llcd == NULL || !(flags & OBD_LLOG_FL_SENDNOW))
-                        GOTO(out, rc);
-        }
+        CDEBUG(D_RPCTRACE, "Sending llcd %p\n", llcd);
  
-        if ((llcd->llcd_size - llcd->llcd_cookiebytes) < sizeof(*cookies) ||
-            (flags & OBD_LLOG_FL_SENDNOW)) {
-                CDEBUG(D_RPCTRACE, "send llcd %p:%p\n", llcd, llcd->llcd_ctxt);
-                ctxt->loc_llcd = NULL;
-                llcd_send(llcd);
+        import = llcd->llcd_ctxt->loc_imp;
+        if (!import || (import == LP_POISON) || 
+            (import->imp_client == LP_POISON)) {
+                CERROR("Invalid import %p for llcd %p\n", 
+                       import, llcd);
+                GOTO(exit, rc = -ENODEV);
          }
-out:
-        mutex_up(&ctxt->loc_sem);
-        return rc;
-}
-EXPORT_SYMBOL(llog_obd_repl_cancel);
  
-int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
-{
-        int rc = 0;
-        ENTRY;
+        OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10);
+
+        /*
+         * No need to get import here as it is already done in 
+         * llog_receptor_accept().
+         */
+        req = ptlrpc_prep_req(import, LUSTRE_LOG_VERSION,
+                              OBD_LOG_CANCEL, 2, size, bufs);
+        if (req == NULL) {
+                CERROR("Can't allocate request for sending llcd %p\n", 
+                       llcd);
+                GOTO(exit, rc = -ENOMEM);
+        }
  
-        if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) {
-                CDEBUG(D_RPCTRACE,"reverse import disconnect\n");
-                /* 
-                 * We put llcd because it is not going to sending list and
-                 * thus, its refc will not be handled. We will handle it here.
-                 */
-                ctxt_llcd_put(ctxt);
-        } else {
-                /* 
-                 * Sending cancel. This means that ctxt->loc_llcd wil be
-                 * put on sending list in llog_obd_repl_cancel() and in
-                 * this case recovery thread will take care of it refc.
-                 */
-                rc = llog_cancel(ctxt, NULL, 0, NULL, OBD_LLOG_FL_SENDNOW);
+        /* 
+         * Check if we're in exit stage again. Do not send llcd in
+         * this case. 
+         */
+        if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags)) {
+                ptlrpc_req_finished(req);
+                GOTO(exit, rc = -ENODEV);
          }
  
+        /* bug 5515 */
+        req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
+        req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+        ptlrpc_req_set_repsize(req, 1, NULL);
+        ptlrpc_at_set_req_timeout(req);
+        req->rq_interpret_reply = llcd_interpret;
+        req->rq_async_args.pointer_arg[0] = llcd;
+
+        /* llog cancels will be replayed after reconnect so this will do twice
+         * first from replay llog, second for resended rpc */
+        req->rq_no_delay = req->rq_no_resend = 1;
+
+        rc = ptlrpc_set_add_new_req(&lcm->lcm_pc, req);
+        if (rc) {
+                ptlrpc_req_finished(req);
+                GOTO(exit, rc);
+        }
          RETURN(rc);
+exit:
+        CDEBUG(D_RPCTRACE, "Refused llcd %p\n", llcd);
+        llcd_free(llcd);
+        return rc;
  }
-EXPORT_SYMBOL(llog_obd_repl_sync);
  
-static void llog_lcm_dec(struct llog_commit_master *lcm)
+/**
+ * Attach @llcd to @ctxt. Establish llcd vs. ctxt reserve connection
+ * so hat they can refer each other.
+ */
+static int
+llcd_attach(struct llog_ctxt *ctxt, struct llog_canceld_ctxt *llcd)
  {
-        atomic_dec(&lcm->lcm_thread_total);
-        cfs_waitq_signal(&lcm->lcm_waitq);
-}
+        LASSERT(ctxt != NULL && llcd != NULL);
+        LASSERT_SEM_LOCKED(&ctxt->loc_sem);
+        LASSERT(ctxt->loc_llcd == NULL);
+        llcd->llcd_ctxt = llog_ctxt_get(ctxt);
+        ctxt->loc_llcd = llcd;
  
-static int log_commit_thread(void *arg)
-{
-        struct llog_commit_daemon *lcd = arg;
-        struct llog_commit_master *lcm = lcd->lcd_lcm;
-        struct llog_canceld_ctxt *llcd, *n;
-        struct obd_import *import = NULL;
-        ENTRY;
+        CDEBUG(D_RPCTRACE, "Attach llcd %p to ctxt %p\n",
+               llcd, ctxt);
  
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1,
-                    "ll_log_comt_%02d", lcd->lcd_index);
-        
-        ptlrpc_daemonize(cfs_curproc_comm()); /* thread never needs to do IO */
-        CDEBUG(D_HA, "%s started\n", cfs_curproc_comm());
-        
-        do {
-                struct ptlrpc_request *request;
-                struct list_head *sending_list;
-                int rc = 0;
-
-                if (import)
-                        class_import_put(import);
-                import = NULL;
-
-                /* If we do not have enough pages available, allocate some */
-                while (atomic_read(&lcm->lcm_llcd_numfree) <
-                       lcm->lcm_llcd_minfree) {
-                        if (llcd_alloc(lcm) < 0)
-                                break;
-                }
+        return 0;
+}
  
-                spin_lock(&lcm->lcm_thread_lock);
-                atomic_inc(&lcm->lcm_thread_numidle);
-                list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_idle);
-                spin_unlock(&lcm->lcm_thread_lock);
-
-                wait_event_interruptible(lcm->lcm_waitq,
-                                         !list_empty(&lcm->lcm_llcd_pending) ||
-                                         lcm->lcm_flags & LLOG_LCM_FL_EXIT);
-
-                /* If we are the last available thread, start a new one in case
-                 * we get blocked on an RPC (nobody else will start a new one)*/
-                spin_lock(&lcm->lcm_thread_lock);
-                atomic_dec(&lcm->lcm_thread_numidle);
-                list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_busy);
-                spin_unlock(&lcm->lcm_thread_lock);
-
-                sending_list = &lcm->lcm_llcd_pending;
-        resend:
-                if (import)
-                        class_import_put(import);
-                import = NULL;
-                if (lcm->lcm_flags & LLOG_LCM_FL_EXIT) {
-                        lcm->lcm_llcd_maxfree = 0;
-                        lcm->lcm_llcd_minfree = 0;
-                        lcm->lcm_thread_max = 0;
-
-                        if (list_empty(&lcm->lcm_llcd_pending) ||
-                            lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE)
-                                break;
-                }
+/**
+ * Opposite to llcd_attach(). Detaches llcd from its @ctxt. This makes
+ * sure that this llcd will not be found another time we try to cancel.
+ */
+static struct llog_canceld_ctxt *llcd_detach(struct llog_ctxt *ctxt)
+{
+        struct llog_canceld_ctxt *llcd;
  
-                if (atomic_read(&lcm->lcm_thread_numidle) <= 1 &&
-                    atomic_read(&lcm->lcm_thread_total) < lcm->lcm_thread_max) {
-                        rc = llog_start_commit_thread(lcm);
-                        if (rc < 0)
-                                CERROR("error starting thread: rc %d\n", rc);
-                }
+        LASSERT(ctxt != NULL);
+        LASSERT_SEM_LOCKED(&ctxt->loc_sem);
  
-                /* Move all of the pending cancels from the same OST off of
-                 * the list, so we don't get multiple threads blocked and/or
-                 * doing upcalls on the same OST in case of failure. */
-                spin_lock(&lcm->lcm_llcd_lock);
-                if (!list_empty(sending_list)) {
-                        list_move_tail(sending_list->next,
-                                       &lcd->lcd_llcd_list);
-                        llcd = list_entry(lcd->lcd_llcd_list.next,
-                                          typeof(*llcd), llcd_list);
-                        LASSERT(llcd->llcd_lcm == lcm);
-                        import = llcd->llcd_ctxt->loc_imp;
-                        if (import)
-                                class_import_get(import);
-                }
-                list_for_each_entry_safe(llcd, n, sending_list, llcd_list) {
-                        LASSERT(llcd->llcd_lcm == lcm);
-                        if (import == llcd->llcd_ctxt->loc_imp)
-                                list_move_tail(&llcd->llcd_list,
-                                               &lcd->lcd_llcd_list);
-                }
-                if (sending_list != &lcm->lcm_llcd_resend) {
-                        list_for_each_entry_safe(llcd, n, &lcm->lcm_llcd_resend,
-                                                 llcd_list) {
-                                LASSERT(llcd->llcd_lcm == lcm);
-                                if (import == llcd->llcd_ctxt->loc_imp)
-                                        list_move_tail(&llcd->llcd_list,
-                                                       &lcd->lcd_llcd_list);
-                        }
-                }
-                spin_unlock(&lcm->lcm_llcd_lock);
-
-                /* We are the only one manipulating our local list - no lock */
-                list_for_each_entry_safe(llcd,n, &lcd->lcd_llcd_list,llcd_list){
-                        int size[2] = { sizeof(struct ptlrpc_body),
-                                        llcd->llcd_cookiebytes };
-                        char *bufs[2] = { NULL, (char *)llcd->llcd_cookies };
-
-                        list_del(&llcd->llcd_list);
-                        if (llcd->llcd_cookiebytes == 0) {
-                                CDEBUG(D_RPCTRACE, "put empty llcd %p:%p\n",
-                                       llcd, llcd->llcd_ctxt);
-                                llcd_put(llcd);
-                                continue;
-                        }
-
-                        mutex_down(&llcd->llcd_ctxt->loc_sem);
-                        if (llcd->llcd_ctxt->loc_imp == NULL) {
-                                mutex_up(&llcd->llcd_ctxt->loc_sem);
-                                CWARN("import will be destroyed, put "
-                                      "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
-                                llcd_put(llcd);
-                                continue;
-                        }
-                        mutex_up(&llcd->llcd_ctxt->loc_sem);
-
-                        if (!import || (import == LP_POISON) ||
-                            (import->imp_client == LP_POISON)) {
-                                CERROR("No import %p (llcd=%p, ctxt=%p)\n",
-                                       import, llcd, llcd->llcd_ctxt);
-                                llcd_put(llcd);
-                                continue;
-                        }
-
-                        OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10);
-
-                        request = ptlrpc_prep_req(import, LUSTRE_LOG_VERSION,
-                                                  OBD_LOG_CANCEL, 2, size,bufs);
-                        if (request == NULL) {
-                                rc = -ENOMEM;
-                                CERROR("error preparing commit: rc %d\n", rc);
-
-                                spin_lock(&lcm->lcm_llcd_lock);
-                                list_splice_init(&lcd->lcd_llcd_list,
-                                                 &lcm->lcm_llcd_resend);
-                                spin_unlock(&lcm->lcm_llcd_lock);
-                                break;
-                        }
-
-                        /* bug 5515 */
-                        request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
-                        request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
-                        ptlrpc_at_set_req_timeout(request);
-
-                        ptlrpc_req_set_repsize(request, 1, NULL);
-                        mutex_down(&llcd->llcd_ctxt->loc_sem);
-                        if (llcd->llcd_ctxt->loc_imp == NULL) {
-                                mutex_up(&llcd->llcd_ctxt->loc_sem);
-                                CWARN("import will be destroyed, put "
-                                      "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
-                                llcd_put(llcd);
-                                ptlrpc_req_finished(request);
-                                continue;
-                        }
-                        mutex_up(&llcd->llcd_ctxt->loc_sem);
-                        rc = ptlrpc_queue_wait(request);
-                        ptlrpc_req_finished(request);
-
-                        /* If the RPC failed, we put this and the remaining
-                         * messages onto the resend list for another time. */
-                        if (rc == 0) {
-                                llcd_put(llcd);
-                                continue;
-                        }
-
-                        CERROR("commit %p:%p drop %d cookies: rc %d\n",
-                               llcd, llcd->llcd_ctxt,
-                               (int)(llcd->llcd_cookiebytes /
-                                     sizeof(*llcd->llcd_cookies)), rc);
-                        llcd_put(llcd);
-                }
+        llcd = ctxt->loc_llcd;
+        if (!llcd)
+                return NULL;
  
-                if (rc == 0) {
-                        sending_list = &lcm->lcm_llcd_resend;
-                        if (!list_empty(sending_list))
-                                goto resend;
-                }
-        } while(1);
+        CDEBUG(D_RPCTRACE, "Detach llcd %p from ctxt %p\n", 
+               llcd, ctxt);
  
-        if (import)
-                class_import_put(import);
+        ctxt->loc_llcd = NULL;
+        llog_ctxt_put(ctxt);
+        return llcd;
+}
  
-        /* If we are force exiting, just drop all of the cookies. */
-        if (lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE) {
-                spin_lock(&lcm->lcm_llcd_lock);
-                list_splice_init(&lcm->lcm_llcd_pending, &lcd->lcd_llcd_list);
-                list_splice_init(&lcm->lcm_llcd_resend, &lcd->lcd_llcd_list);
-                list_splice_init(&lcm->lcm_llcd_free, &lcd->lcd_llcd_list);
-                spin_unlock(&lcm->lcm_llcd_lock);
+/**
+ * Return @llcd cached in @ctxt. Allocate new one if required. Attach it
+ * to ctxt so that it may be used for gathering cookies and sending.
+ */
+static struct llog_canceld_ctxt *llcd_get(struct llog_ctxt *ctxt)
+{
+        struct llog_canceld_ctxt *llcd;
  
-                list_for_each_entry_safe(llcd, n, &lcd->lcd_llcd_list,llcd_list)
-                        llcd_put(llcd);
+        llcd = llcd_alloc(ctxt->loc_lcm);
+        if (!llcd) {
+                CERROR("Can't alloc an llcd for ctxt %p\n", ctxt);
+                return NULL;
          }
+        llcd_attach(ctxt, llcd);
+        return llcd;
+}
  
+/**
+ * Deatch llcd from its @ctxt. Free llcd.
+ */
+static void llcd_put(struct llog_ctxt *ctxt)
+{
+        struct llog_canceld_ctxt *llcd;
  
-        CDEBUG(D_HA, "%s exiting\n", cfs_curproc_comm());
-
-        spin_lock(&lcm->lcm_thread_lock);
-        list_del(&lcd->lcd_lcm_list);
-        spin_unlock(&lcm->lcm_thread_lock);
-        OBD_FREE_PTR(lcd);
-        llog_lcm_dec(lcm);
-
-        RETURN(0);
+        llcd = llcd_detach(ctxt);
+        if (llcd)
+                llcd_free(llcd);
  }
  
-int llog_start_commit_thread(struct llog_commit_master *lcm)
+/**
+ * Detach llcd from its @ctxt so that nobody will find it with try to
+ * re-use. Send llcd to remote node.
+ */
+static int llcd_push(struct llog_ctxt *ctxt)
  {
-        struct llog_commit_daemon *lcd;
-        int rc, index; 
-        ENTRY;
+        struct llog_canceld_ctxt *llcd;
+        int rc;
  
-        if (atomic_read(&lcm->lcm_thread_total) >= lcm->lcm_thread_max)
-                RETURN(0);
-
-        /* Check whether it will be cleanup llog commit thread first,
-         * If not, increate the lcm_thread_total count to prevent the 
-         * lcm being freed when the log_commit_thread is started */
-        spin_lock(&lcm->lcm_thread_lock);
-        if (!lcm->lcm_flags & LLOG_LCM_FL_EXIT) { 
-                atomic_inc(&lcm->lcm_thread_total);
-                index = atomic_read(&lcm->lcm_thread_total);
-                spin_unlock(&lcm->lcm_thread_lock);
-        } else {
-                spin_unlock(&lcm->lcm_thread_lock);
-                RETURN(0);
+        /*
+         * Make sure that this llcd will not be sent again as we detach 
+         * it from ctxt.
+         */
+        llcd = llcd_detach(ctxt);
+        if (!llcd) {
+                CERROR("Invalid detached llcd found %p\n", llcd);
+                llcd_print(llcd, __FUNCTION__, __LINE__);
+                LBUG();
          }
+        
+        rc = llcd_send(llcd);
+        if (rc)
+                CERROR("Couldn't send llcd %p (%d)\n", llcd, rc);
+        return rc;
+}
  
-        OBD_ALLOC_PTR(lcd);
-        if (lcd == NULL)
-                GOTO(cleanup, rc = -ENOMEM);
-
-        CFS_INIT_LIST_HEAD(&lcd->lcd_lcm_list);
-        CFS_INIT_LIST_HEAD(&lcd->lcd_llcd_list);
-        lcd->lcd_index = index;
-        lcd->lcd_lcm = lcm;
+/**
+ * Start recovery thread which actually deals llcd sending. This
+ * is all ptlrpc standard thread based so there is not much of work
+ * to do.
+ */
+int llog_recov_thread_start(struct llog_commit_master *lcm)
+{
+        int rc;
+        ENTRY;
  
-        rc = cfs_kernel_thread(log_commit_thread, lcd, CLONE_VM | CLONE_FILES);
-cleanup:
-        if (rc < 0) {
-                CERROR("error starting thread #%d: %d\n", lcd->lcd_index, rc);
-                llog_lcm_dec(lcm);
-                if (lcd) 
-                        OBD_FREE_PTR(lcd);
+        rc = ptlrpcd_start(lcm->lcm_name, &lcm->lcm_pc);
+        if (rc) {
+                CERROR("Error %d while starting recovery thread %s\n", 
+                       rc, lcm->lcm_name);
                  RETURN(rc);
          }
-        RETURN(0);
+        RETURN(rc);
  }
-EXPORT_SYMBOL(llog_start_commit_thread);
-
-static struct llog_process_args {
-        struct semaphore         llpa_sem;
-        struct llog_ctxt        *llpa_ctxt;
-        void                    *llpa_cb;
-        void                    *llpa_arg;
-} llpa;
+EXPORT_SYMBOL(llog_recov_thread_start);
  
-int llog_init_commit_master(struct llog_commit_master *lcm)
+/**
+ * Stop recovery thread. Complement to llog_recov_thread_start().
+ */
+void llog_recov_thread_stop(struct llog_commit_master *lcm, int force)
  {
-        CFS_INIT_LIST_HEAD(&lcm->lcm_thread_busy);
-        CFS_INIT_LIST_HEAD(&lcm->lcm_thread_idle);
-        spin_lock_init(&lcm->lcm_thread_lock);
-        atomic_set(&lcm->lcm_thread_numidle, 0);
-        cfs_waitq_init(&lcm->lcm_waitq);
-        CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_pending);
-        CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_resend);
-        CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_free);
-        spin_lock_init(&lcm->lcm_llcd_lock);
-        atomic_set(&lcm->lcm_llcd_numfree, 0);
-        lcm->lcm_llcd_minfree = 0;
-        lcm->lcm_thread_max = 5;
-        /* FIXME initialize semaphore for llog_process_args */
-        sema_init(&llpa.llpa_sem, 1);
-        return 0;
-}
-EXPORT_SYMBOL(llog_init_commit_master);
+        ENTRY;
  
-int llog_cleanup_commit_master(struct llog_commit_master *lcm,
-                               int force)
-{
-        spin_lock(&lcm->lcm_thread_lock);
-        lcm->lcm_flags |= LLOG_LCM_FL_EXIT;
-        if (force)
-                lcm->lcm_flags |= LLOG_LCM_FL_EXIT_FORCE;
+        /*
+         * Let all know that we're stopping. This will also make 
+         * llcd_send() refuse any new llcds.
+         */
+        set_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags);
+
+        /*
+         * Stop processing thread. No new rpcs will be accepted for
+         * for processing now.
+         */
+        ptlrpcd_stop(&lcm->lcm_pc, force);
          
-        spin_unlock(&lcm->lcm_thread_lock);
-        
-        cfs_waitq_signal(&lcm->lcm_waitq);
-
-        wait_event_interruptible(lcm->lcm_waitq,
-                                 atomic_read(&lcm->lcm_thread_total) == 0);
-        return 0;
+        /*
+         * By this point no alive inflight llcds should be left. Only
+         * those forgotten in sync may still be attached to ctxt. Let's
+         * print them.
+         */
+        if (atomic_read(&lcm->lcm_count) != 0) {
+                struct llog_canceld_ctxt *llcd;
+                struct list_head         *tmp;
+
+                CERROR("Busy llcds found (%d) on lcm %p\n", 
+                       atomic_read(&lcm->lcm_count) == 0, lcm);
+
+                spin_lock(&lcm->lcm_lock);
+                list_for_each(tmp, &lcm->lcm_llcds) {
+                        llcd = list_entry(tmp, struct llog_canceld_ctxt,
+                                          llcd_list);
+                        llcd_print(llcd, __FUNCTION__, __LINE__);
+                }
+                spin_unlock(&lcm->lcm_lock);
+                
+                /*
+                 * No point to go further with busy llcds at this point
+                 * as this is clear bug. It might mean we got hanging
+                 * rpc which holds import ref and this means we will not
+                 * be able to cleanup anyways.
+                 *
+                 * Or we just missed to kill them when they were not
+                 * attached to ctxt. In this case our slab will remind
+                 * us about this a bit later.
+                 */
+                LBUG();
+        }
+        EXIT;
  }
-EXPORT_SYMBOL(llog_cleanup_commit_master);
+EXPORT_SYMBOL(llog_recov_thread_stop);
  
-static int log_process_thread(void *args)
+/**
+ * Initialize commit master structure and start recovery thread on it.
+ */
+struct llog_commit_master *llog_recov_thread_init(char *name)
  {
-        struct llog_process_args *data = args;
-        struct llog_ctxt *ctxt = data->llpa_ctxt;
-        void   *cb = data->llpa_cb;
-        struct llog_logid logid = *(struct llog_logid *)(data->llpa_arg);
-        struct llog_handle *llh = NULL;
+        struct llog_commit_master *lcm;
          int rc;
          ENTRY;
  
-        mutex_up(&data->llpa_sem);
-        ptlrpc_daemonize("llog_process");     /* thread does IO to log files */
+        OBD_ALLOC_PTR(lcm);
+        if (!lcm)
+                RETURN(NULL);
  
-        rc = llog_create(ctxt, &llh, &logid, NULL);
+        /*
+         * Try to create threads with unique names.
+         */
+        snprintf(lcm->lcm_name, sizeof(lcm->lcm_name), 
+                 "ll_log_commit_%s", name);
+
+        atomic_set(&lcm->lcm_count, 0);
+        spin_lock_init(&lcm->lcm_lock);
+        CFS_INIT_LIST_HEAD(&lcm->lcm_llcds);
+        rc = llog_recov_thread_start(lcm);
          if (rc) {
-                CERROR("llog_create failed %d\n", rc);
+                CERROR("Can't start commit thread, rc %d\n", rc);
                  GOTO(out, rc);
          }
-        rc = llog_init_handle(llh, LLOG_F_IS_CAT, NULL);
-        if (rc) {
-                CERROR("llog_init_handle failed %d\n", rc);
-                GOTO(release_llh, rc);
-        }
-
-        if (cb) {
-                rc = llog_cat_process(llh, (llog_cb_t)cb, NULL);
-                if (rc != LLOG_PROC_BREAK)
-                        CERROR("llog_cat_process failed %d\n", rc);
-        } else {
-                CWARN("no callback function for recovery\n");
-        }
-
-        CDEBUG(D_HA, "send llcd %p:%p forcibly after recovery\n",
-               ctxt->loc_llcd, ctxt);
-        llog_sync(ctxt, NULL);
-
-release_llh:
-        rc = llog_cat_put(llh);
-        if (rc)
-                CERROR("llog_cat_put failed %d\n", rc);
+        RETURN(lcm);
  out:
-        llog_ctxt_put(ctxt);
-        RETURN(rc);
+        OBD_FREE_PTR(lcm);
+        return NULL;
+}
+EXPORT_SYMBOL(llog_recov_thread_init);
+
+/**
+ * Finalize commit master and its recovery thread.
+ */
+void llog_recov_thread_fini(struct llog_commit_master *lcm, int force)
+{
+        ENTRY;
+        llog_recov_thread_stop(lcm, force);
+        OBD_FREE_PTR(lcm);
+        EXIT;
  }
+EXPORT_SYMBOL(llog_recov_thread_fini);
  
-static int llog_recovery_generic(struct llog_ctxt *ctxt, void *handle,void *arg)
+static int llog_recov_thread_replay(struct llog_ctxt *ctxt, 
+                                    void *cb, void *arg)
  {
          struct obd_device *obd = ctxt->loc_obd;
+        struct llog_process_cat_args *lpca;
          int rc;
          ENTRY;
  
          if (obd->obd_stopping)
                  RETURN(-ENODEV);
  
-        mutex_down(&llpa.llpa_sem);
-        llpa.llpa_cb = handle;
-        llpa.llpa_arg = arg;
-        llpa.llpa_ctxt = llog_get_context(ctxt->loc_obd, ctxt->loc_idx);
-        if (!llpa.llpa_ctxt) {
-                mutex_up(&llpa.llpa_sem);
+        /*
+         * This will be balanced in llog_cat_process_thread()
+         */
+        OBD_ALLOC_PTR(lpca);
+        if (!lpca)
+                RETURN(-ENOMEM);
+
+        lpca->lpca_cb = cb;
+        lpca->lpca_arg = arg;
+
+        /*
+         * This will be balanced in llog_cat_process_thread()
+         */
+        lpca->lpca_ctxt = llog_ctxt_get(ctxt);
+        if (!lpca->lpca_ctxt) {
+                OBD_FREE_PTR(lpca);
                  RETURN(-ENODEV);
          }
-        rc = cfs_kernel_thread(log_process_thread, &llpa, CLONE_VM | CLONE_FILES);
+        rc = cfs_kernel_thread(llog_cat_process_thread, lpca, 
+                               CLONE_VM | CLONE_FILES);
          if (rc < 0) {
+                CERROR("Error starting llog_cat_process_thread(): %d\n", rc);
+                OBD_FREE_PTR(lpca);
                  llog_ctxt_put(ctxt);
-                CERROR("error starting log_process_thread: %d\n", rc);
          } else {
-                CDEBUG(D_HA, "log_process_thread: %d\n", rc);
+                CDEBUG(D_HA, "Started llog_cat_process_thread(): %d\n", rc);
                  rc = 0;
          }
  
          RETURN(rc);
  }
  
-int llog_repl_connect(struct llog_ctxt *ctxt, int count,
-                      struct llog_logid *logid, struct llog_gen *gen,
-                      struct obd_uuid *uuid)
+int llog_obd_repl_connect(struct llog_ctxt *ctxt,
+                          struct llog_logid *logid, struct llog_gen *gen,
+                          struct obd_uuid *uuid)
  {
-        struct llog_canceld_ctxt *llcd;
          int rc;
          ENTRY;
  
-        /* send back llcd before recovery from llog */
-        if (ctxt->loc_llcd != NULL) {
-                CWARN("llcd %p:%p not empty\n", ctxt->loc_llcd, ctxt);
-                llog_sync(ctxt, NULL);
-        }
+        /* 
+         * Send back cached llcd from llog before recovery if we have any.
+         * This is void is nothing cached is found there.
+         */
+        llog_sync(ctxt, NULL);
  
+        /* 
+         * Start recovery in separate thread. 
+         */
          mutex_down(&ctxt->loc_sem);
          ctxt->loc_gen = *gen;
-        llcd = ctxt_llcd_grab(ctxt);
-        if (llcd == NULL) {
-                CERROR("couldn't get an llcd\n");
-                mutex_up(&ctxt->loc_sem);
-                RETURN(-ENOMEM);
-        }
+        rc = llog_recov_thread_replay(ctxt, ctxt->llog_proc_cb, logid);
          mutex_up(&ctxt->loc_sem);
  
-        rc = llog_recovery_generic(ctxt, ctxt->llog_proc_cb, logid);
-        if (rc != 0) {
-                ctxt_llcd_put(ctxt);
-                CERROR("error recovery process: %d\n", rc);
+        RETURN(rc);
+}
+EXPORT_SYMBOL(llog_obd_repl_connect);
+
+/** 
+ * Deleted objects have a commit callback that cancels the MDS
+ * log record for the deletion. The commit callback calls this
+ * function.
+ */
+int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
+                         struct lov_stripe_md *lsm, int count,
+                         struct llog_cookie *cookies, int flags)
+{
+        struct llog_commit_master *lcm;
+        struct llog_canceld_ctxt *llcd;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(ctxt != NULL);
+
+        mutex_down(&ctxt->loc_sem);
+        lcm = ctxt->loc_lcm;
+
+        /*
+         * Let's check if we have all structures alive. We also check for
+         * possible shutdown. Do nothing if we're stopping.
+         */
+        if (ctxt->loc_imp == NULL) {
+                CDEBUG(D_RPCTRACE, "No import for ctxt %p\n", ctxt);
+                GOTO(out, rc = -ENODEV);
+        }
+
+        if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags)) {
+                CDEBUG(D_RPCTRACE, "Commit thread is stopping for ctxt %p\n", 
+                       ctxt);
+                GOTO(out, rc = -ENODEV);
+        }
+
+        llcd = ctxt->loc_llcd;
+
+        if (count > 0 && cookies != NULL) {
+                /*
+                 * Get new llcd from ctxt if required. 
+                 */
+                if (!llcd) {
+                        llcd = llcd_get(ctxt);
+                        if (!llcd)
+                                GOTO(out, rc = -ENOMEM);
+                        /*
+                         * Allocation is successful, let's check for stop
+                         * flag again to fall back as soon as possible.
+                         */
+                        if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags))
+                                GOTO(out, rc = -ENODEV);
+                }
+
+                /*
+                 * Llcd does not have enough room for @cookies. Let's push 
+                 * it out and allocate new one. 
+                 */
+                if (!llcd_fit(llcd, cookies)) {
+                        rc = llcd_push(ctxt);
+                        if (rc)
+                                GOTO(out, rc);
+                        llcd = llcd_get(ctxt);
+                        if (!llcd)
+                                GOTO(out, rc = -ENOMEM);
+                        /*
+                         * Allocation is successful, let's check for stop
+                         * flag again to fall back as soon as possible.
+                         */
+                        if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags))
+                                GOTO(out, rc = -ENODEV);
+                }
+
+                /*
+                 * Copy cookies to @llcd, no matter old or new allocated
+                 * one.
+                 */
+                llcd_copy(llcd, cookies);
          }
  
+        /*
+         * Let's check if we need to send copied @cookies asap. If yes
+         * then do it.
+         */
+        if (llcd && (flags & OBD_LLOG_FL_SENDNOW)) {
+                CDEBUG(D_RPCTRACE, "Sync llcd %p\n", llcd);
+                rc = llcd_push(ctxt);
+                if (rc)
+                        GOTO(out, rc);
+        }
+        EXIT;
+out:
+        if (rc)
+                llcd_put(ctxt);
+        mutex_up(&ctxt->loc_sem);
+        return rc;
+}
+EXPORT_SYMBOL(llog_obd_repl_cancel);
+
+int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
+{
+        int rc = 0;
+        ENTRY;
+
+        /* 
+         * Flush any remaining llcd. 
+         */
+        mutex_down(&ctxt->loc_sem);
+        if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) {
+                /*
+                 * This is ost->mds connection, we can't be sure that mds
+                 * can still receive cookies, let's killed the cached llcd.
+                 */
+                CDEBUG(D_RPCTRACE, "Kill cached llcd\n");
+                llcd_put(ctxt);
+                mutex_up(&ctxt->loc_sem);
+        } else {
+                /* 
+                 * This is either llog_sync() from generic llog code or sync
+                 * on client disconnect. In either way let's do it and send
+                 * llcds to the target with waiting for completion. 
+                 */
+                CDEBUG(D_RPCTRACE, "Sync cached llcd\n");
+                mutex_up(&ctxt->loc_sem);
+                rc = llog_cancel(ctxt, NULL, 0, NULL, OBD_LLOG_FL_SENDNOW);
+        }
          RETURN(rc);
  }
-EXPORT_SYMBOL(llog_repl_connect);
+EXPORT_SYMBOL(llog_obd_repl_sync);
  
  #else /* !__KERNEL__ */
  
@@ -680,3 +711,43 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
          return 0;
  }
  #endif
+
+/**
+ * Module init time fucntion. Initializes slab for llcd objects.
+ */
+int llog_recov_init(void)
+{
+        int llcd_size;
+
+        llcd_size = CFS_PAGE_SIZE - 
+                lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
+        llcd_size += offsetof(struct llog_canceld_ctxt, llcd_cookies);
+        llcd_cache = cfs_mem_cache_create("llcd_cache", llcd_size, 0, 0);
+        if (!llcd_cache) {
+                CERROR("Error allocating llcd cache\n");
+                return -ENOMEM;
+        }
+        return 0;
+}
+
+/**
+ * Module fini time fucntion. Releases slab for llcd objects.
+ */
+void llog_recov_fini(void)
+{
+        /*
+         * Kill llcd cache when thread is stopped and we're sure no 
+         * llcd in use left.
+         */
+        if (llcd_cache) {
+                /*
+                 * In 2.6.22 cfs_mem_cache_destroy() will not return error
+                 * for busy resources. Let's check it another way.
+                 */
+                LASSERTF(atomic_read(&llcd_count) == 0, 
+                         "Can't destroy llcd cache! Number of "
+                         "busy llcds: %d\n", atomic_read(&llcd_count));
+                cfs_mem_cache_destroy(llcd_cache);
+                llcd_cache = NULL;
+        }
+}
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c

index cc9448c..2ffab93 100644 (file)
--- a/lustre/ptlrpc/recover.c
+++ b/lustre/ptlrpc/recover.c
@@ -1,28 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Portal-RPC reconnection and replay operations, for use in recovery.
+ * GPL HEADER START
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Mike Shaver <shaver@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/recover.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -97,20 +110,15 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
           */
          list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
                  req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
-
                  /* If need to resend the last sent transno (because a
                     reconnect has occurred), then stop on the matching
                     req and send it again. If, however, the last sent
                     transno has been committed then we continue replay
                     from the next request. */
-                if (imp->imp_resend_replay && 
-                    req->rq_transno == last_transno) {
-                        lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
-                        break;
-                }
-
                  if (req->rq_transno > last_transno) {
-                        imp->imp_last_replay_transno = req->rq_transno;
+                        if (imp->imp_resend_replay)
+                                lustre_msg_add_flags(req->rq_reqmsg,
+                                                     MSG_RESENT);
                          break;
                  }
  
@@ -173,7 +181,7 @@ void ptlrpc_wake_delayed(struct obd_import *imp)
                  req = list_entry(tmp, struct ptlrpc_request, rq_list);
  
                  DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
-                ptlrpc_wake_client_req(req);
+                ptlrpc_client_wake_req(req);
          }
          spin_unlock(&imp->imp_lock);
  }
@@ -232,19 +240,18 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active)
          if (!active) {
                  LCONSOLE_WARN("setting import %s INACTIVE by administrator "
                                "request\n", obd2cli_tgt(imp->imp_obd));
-                ptlrpc_invalidate_import(imp);
  
+                /* set before invalidate to avoid messages about imp_inval
+                 * set without imp_deactive in ptlrpc_import_delay_req */
                  spin_lock(&imp->imp_lock);
                  imp->imp_deactive = 1;
                  spin_unlock(&imp->imp_lock);
+
+                ptlrpc_invalidate_import(imp);
          }
  
          /* When activating, mark import valid, and attempt recovery */
          if (active) {
-                spin_lock(&imp->imp_lock);
-                imp->imp_deactive = 0;
-                spin_unlock(&imp->imp_lock);
-
                  CDEBUG(D_HA, "setting import %s VALID\n",
                         obd2cli_tgt(imp->imp_obd));
                  rc = ptlrpc_recover_import(imp, NULL);
@@ -259,6 +266,13 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
          int rc;
          ENTRY;
  
+        spin_lock(&imp->imp_lock);
+        if (atomic_read(&imp->imp_inval_count)) {
+                spin_unlock(&imp->imp_lock);
+                RETURN(-EINVAL);
+        }
+        spin_unlock(&imp->imp_lock);
+
          /* force import to be disconnected. */
          ptlrpc_set_import_discon(imp, 0);
  
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c

index 276689b..f32d283 100644 (file)
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1,26 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #define DEBUG_SUBSYSTEM S_RPC
@@ -199,7 +210,7 @@ ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs)
  }
  
  void
-ptlrpc_commit_replies (struct obd_device *obd)
+ptlrpc_commit_replies (struct obd_export *exp)
  {
          struct list_head   *tmp;
          struct list_head   *nxt;
@@ -208,15 +219,16 @@ ptlrpc_commit_replies (struct obd_device *obd)
           * to attend to complete them. */
  
          /* CAVEAT EMPTOR: spinlock ordering!!! */
-        spin_lock(&obd->obd_uncommitted_replies_lock);
+        spin_lock(&exp->exp_uncommitted_replies_lock);
  
-        list_for_each_safe (tmp, nxt, &obd->obd_uncommitted_replies) {
+        list_for_each_safe(tmp, nxt, &exp->exp_uncommitted_replies) {
                  struct ptlrpc_reply_state *rs =
                          list_entry(tmp, struct ptlrpc_reply_state, rs_obd_list);
  
-                LASSERT (rs->rs_difficult);
-
-                if (rs->rs_transno <= obd->obd_last_committed) {
+                LASSERT(rs->rs_difficult);
+                /* VBR: per-export last_committed */
+                LASSERT(rs->rs_export);
+                if (rs->rs_transno <= rs->rs_export->exp_last_committed) {
                          struct ptlrpc_service *svc = rs->rs_service;
  
                          spin_lock (&svc->srv_lock);
@@ -226,7 +238,7 @@ ptlrpc_commit_replies (struct obd_device *obd)
                  }
          }
  
-        spin_unlock(&obd->obd_uncommitted_replies_lock);
+        spin_unlock(&exp->exp_uncommitted_replies_lock);
  }
  
  static int
@@ -279,9 +291,6 @@ ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc)
  static void ptlrpc_at_timer(unsigned long castmeharder)
  {
          struct ptlrpc_service *svc = (struct ptlrpc_service *)castmeharder;
-        CDEBUG(D_INFO, "at timer %s hit at %ld%s\n",
-               svc->srv_name, cfs_time_current_sec(), 
-               list_empty(&svc->srv_at_list) ? ", empty" : ""); 
          svc->srv_at_check = 1;
          svc->srv_at_checktime = cfs_time_current();
          cfs_waitq_signal(&svc->srv_waitq);
@@ -293,8 +302,9 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
                  int req_portal, int rep_portal, int watchdog_factor,
                  svc_handler_t handler, char *name,
                  cfs_proc_dir_entry_t *proc_entry,
-                svcreq_printfn_t svcreq_printfn, 
-                int min_threads, int max_threads, char *threadname)
+                svcreq_printfn_t svcreq_printfn,
+                int min_threads, int max_threads, char *threadname,
+                svc_hpreq_handler_t hp_handler)
  {
          int                    rc;
          struct ptlrpc_service *service;
@@ -302,7 +312,7 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
  
          LASSERT (nbufs > 0);
          LASSERT (bufsize >= max_req_size);
-        
+
          OBD_ALLOC(service, sizeof(*service));
          if (service == NULL)
                  RETURN(NULL);
@@ -327,11 +337,16 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
          service->srv_threads_min = min_threads;
          service->srv_threads_max = max_threads;
          service->srv_thread_name = threadname;
+        service->srv_hpreq_handler = hp_handler;
+        service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO;
+        service->srv_hpreq_count = 0;
+        service->srv_n_hpreq = 0;
  
          rc = LNetSetLazyPortal(service->srv_req_portal);
          LASSERT (rc == 0);
  
          CFS_INIT_LIST_HEAD(&service->srv_request_queue);
+        CFS_INIT_LIST_HEAD(&service->srv_request_hpq);
          CFS_INIT_LIST_HEAD(&service->srv_idle_rqbds);
          CFS_INIT_LIST_HEAD(&service->srv_active_rqbds);
          CFS_INIT_LIST_HEAD(&service->srv_history_rqbds);
@@ -345,14 +360,14 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
          CFS_INIT_LIST_HEAD(&service->srv_req_in_queue);
          CFS_INIT_LIST_HEAD(&service->srv_at_list);
          cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service);
-        /* At SOW, service time should be quick; 10s seems generous. If client 
+        /* At SOW, service time should be quick; 10s seems generous. If client
             timeout is less than this, we'll be sending an early reply. */
          at_init(&service->srv_at_estimate, 10, 0);
  
          spin_lock (&ptlrpc_all_services_lock);
          list_add (&service->srv_list, &ptlrpc_all_services);
          spin_unlock (&ptlrpc_all_services_lock);
-        
+
          /* Now allocate the request buffers */
          rc = ptlrpc_grow_req_bufs(service);
          /* We shouldn't be under memory pressure at startup, so
@@ -378,38 +393,32 @@ failed:
          return NULL;
  }
  
-static void ptlrpc_server_req_decref(struct ptlrpc_request *req)
+/**
+ * to actually free the request, must be called without holding svc_lock.
+ * note it's caller's responsibility to unlink req->rq_list.
+ */
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
  {
-        struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
+        LASSERT(atomic_read(&req->rq_refcount) == 0);
+        LASSERT(list_empty(&req->rq_timed_list));
  
-        if (!atomic_dec_and_test(&req->rq_refcount))
-                return;
+        /* DEBUG_REQ() assumes the reply state of a request with a valid
+         * ref will not be destroyed until that reference is dropped. */
+        ptlrpc_req_drop_rs(req);
  
-        LASSERT(list_empty(&req->rq_timed_list));
-        if (req != &rqbd->rqbd_req) {
+        if (req != &req->rq_rqbd->rqbd_req) {
                  /* NB request buffers use an embedded
                   * req if the incoming req unlinked the
                   * MD; this isn't one of them! */
                  OBD_FREE(req, sizeof(*req));
-        } else {
-                struct ptlrpc_service *svc = rqbd->rqbd_service;
-                /* schedule request buffer for re-use.
-                 * NB I can only do this after I've disposed of their
-                 * reqs; particularly the embedded req */
-                spin_lock(&svc->srv_lock);
-                list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
-                spin_unlock(&svc->srv_lock);
          }
  }
  
-static void __ptlrpc_server_free_request(struct ptlrpc_request *req)
-{
-        list_del(&req->rq_list);
-        ptlrpc_req_drop_rs(req);
-        ptlrpc_server_req_decref(req);
-}
-
-static void ptlrpc_server_free_request(struct ptlrpc_request *req)
+/**
+ * drop a reference count of the request. if it reaches 0, we either
+ * put it into history list, or free it immediately.
+ */
+static void ptlrpc_server_drop_request(struct ptlrpc_request *req)
  {
          struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
          struct ptlrpc_service             *svc = rqbd->rqbd_service;
@@ -417,12 +426,8 @@ static void ptlrpc_server_free_request(struct ptlrpc_request *req)
          struct list_head                  *tmp;
          struct list_head                  *nxt;
  
-        if (req->rq_phase != RQ_PHASE_NEW) /* incorrect message magic */
-                DEBUG_REQ(D_INFO, req, "free req");
-        spin_lock(&svc->srv_at_lock);
-        req->rq_sent_final = 1;
-        list_del_init(&req->rq_timed_list);
-        spin_unlock(&svc->srv_at_lock);
+        if (!atomic_dec_and_test(&req->rq_refcount))
+                return;
  
          spin_lock(&svc->srv_lock);
  
@@ -465,19 +470,54 @@ static void ptlrpc_server_free_request(struct ptlrpc_request *req)
                                  req = list_entry(rqbd->rqbd_reqs.next,
                                                   struct ptlrpc_request,
                                                   rq_list);
-                                __ptlrpc_server_free_request(req);
+                                list_del(&req->rq_list);
+                                ptlrpc_server_free_request(req);
                          }
  
                          spin_lock(&svc->srv_lock);
+                        /*
+                         * now all reqs including the embedded req has been
+                         * disposed, schedule request buffer for re-use.
+                         */
+                        LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0);
+                        list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
                  }
+
+                spin_unlock(&svc->srv_lock);
          } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
-                 /* If we are low on memory, we are not interested in
-                    history */
-                list_del(&req->rq_history_list);
-                __ptlrpc_server_free_request(req);
+                 /* If we are low on memory, we are not interested in history */
+                list_del(&req->rq_list);
+                list_del_init(&req->rq_history_list);
+                spin_unlock(&svc->srv_lock);
+
+                ptlrpc_server_free_request(req);
+        } else {
+                spin_unlock(&svc->srv_lock);
          }
+}
  
-        spin_unlock(&svc->srv_lock);
+/**
+ * to finish a request: stop sending more early replies, and release
+ * the request. should be called after we finished handling the request.
+ */
+static void ptlrpc_server_finish_request(struct ptlrpc_request *req)
+{
+        struct ptlrpc_service  *svc = req->rq_rqbd->rqbd_service;
+
+        if (req->rq_export) {
+                class_export_put(req->rq_export);
+                req->rq_export = NULL;
+        }
+
+        if (req->rq_phase != RQ_PHASE_NEW) /* incorrect message magic */
+                DEBUG_REQ(D_INFO, req, "free req");
+
+        spin_lock(&svc->srv_at_lock);
+        req->rq_sent_final = 1;
+        list_del_init(&req->rq_timed_list);
+        spin_unlock(&svc->srv_at_lock);
+
+        ptlrpc_server_drop_request(req);
  }
  
  /* This function makes sure dead exports are evicted in a timely manner.
@@ -486,7 +526,7 @@ static void ptlrpc_server_free_request(struct ptlrpc_request *req)
  static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
  {
          struct obd_export *oldest_exp;
-        time_t oldest_time;
+        time_t oldest_time, new_time;
  
          ENTRY;
  
@@ -497,9 +537,13 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
             of the list, we can be really lazy here - we don't have to evict
             at the exact right moment.  Eventually, all silent exports
             will make it to the top of the list. */
-        exp->exp_last_request_time = max(exp->exp_last_request_time,
-                                         cfs_time_current_sec() + extra_delay);
  
+        /* Do not pay attention on 1sec or smaller renewals. */
+        new_time = cfs_time_current_sec() + extra_delay;
+        if (exp->exp_last_request_time + 1 /*second */ >= new_time)
+                RETURN_EXIT;
+
+        exp->exp_last_request_time = new_time;
          CDEBUG(D_INFO, "updating export %s at %ld\n",
                 exp->exp_client_uuid.uuid,
                 exp->exp_last_request_time);
@@ -512,8 +556,7 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
          if (list_empty(&exp->exp_obd_chain_timed)) {
                  /* this one is not timed */
                  spin_unlock(&exp->exp_obd->obd_dev_lock);
-                EXIT;
-                return;
+                RETURN_EXIT;
          }
  
          list_move_tail(&exp->exp_obd_chain_timed,
@@ -546,7 +589,7 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
                                 oldest_time);
                  }
          } else {
-                if (cfs_time_current_sec() > 
+                if (cfs_time_current_sec() >
                      (exp->exp_obd->obd_eviction_timer + extra_delay)) {
                          /* The evictor won't evict anyone who we've heard from
                           * recently, so we don't have to check before we start
@@ -561,7 +604,7 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
  
  static int ptlrpc_check_req(struct ptlrpc_request *req)
  {
-        if (lustre_msg_get_conn_cnt(req->rq_reqmsg) < 
+        if (lustre_msg_get_conn_cnt(req->rq_reqmsg) <
              req->rq_export->exp_conn_cnt) {
                  DEBUG_REQ(D_ERROR, req,
                            "DROPPING req from old connection %d < %d",
@@ -594,16 +637,16 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service *svc)
          }
  
          /* Set timer for closest deadline */
-        rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, 
+        rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
                          rq_timed_list);
          next = (__s32)(rq->rq_deadline - cfs_time_current_sec() -
                         at_early_margin);
-        if (next <= 0) 
+        if (next <= 0)
                  ptlrpc_at_timer((unsigned long)svc);
          else
                  cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next));
          spin_unlock(&svc->srv_at_lock);
-        CDEBUG(D_INFO, "armed %s at %+lds\n", svc->srv_name, next);
+        CDEBUG(D_INFO, "armed %s at %+ds\n", svc->srv_name, next);
  }
  
  /* Add rpc to early reply check list */
@@ -613,15 +656,12 @@ static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
          struct ptlrpc_request *rq;
          int found = 0;
  
-        if (AT_OFF) 
+        if (AT_OFF)
                  return(0);
  
          if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
                  return(-ENOSYS);
-        
-        DEBUG_REQ(D_ADAPTTO, req, "add timed %lds", 
-                  req->rq_deadline - cfs_time_current_sec());
-        
+
          spin_lock(&svc->srv_at_lock);
  
          if (unlikely(req->rq_sent_final)) {
@@ -633,7 +673,7 @@ static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
          /* Add to sorted list.  Presumably latest rpcs will have the latest
             deadlines, so search backward. */
          list_for_each_entry_reverse(rq, &svc->srv_at_list, rq_timed_list) {
-                if (req->rq_deadline > rq->rq_deadline) {
+                if (req->rq_deadline >= rq->rq_deadline) {
                          list_add(&req->rq_timed_list, &rq->rq_timed_list);
                          found++;
                          break;
@@ -652,9 +692,9 @@ static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
                  ptlrpc_at_set_timer(svc);
  
          return 0;
-}            
+}
  
-static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, 
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
                                        int extra_time)
  {
          struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
@@ -664,47 +704,55 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
          time_t newdl;
          int rc;
          ENTRY;
-                            
-        /* deadline is when the client expects us to reply, margin is the 
+
+        /* deadline is when the client expects us to reply, margin is the
             difference between clients' and servers' expectations */
-        DEBUG_REQ(D_ADAPTTO, req, 
+        DEBUG_REQ(D_ADAPTTO, req,
                    "%ssending early reply (deadline %+lds, margin %+lds) for "
                    "%d+%d", AT_OFF ? "AT off - not " : "",
                    olddl, olddl - at_get(&svc->srv_at_estimate),
                    at_get(&svc->srv_at_estimate), extra_time);
  
-        if (AT_OFF) 
+        if (AT_OFF)
                  RETURN(0);
-        
+
          if (olddl < 0) {
-                CDEBUG(D_WARNING, "x"LPU64": Already past deadline (%+lds), not"
-                       " sending early reply. Increase at_early_margin (%d)?\n",
-                       req->rq_xid, olddl, at_early_margin);
+                DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), "
+                          "not sending early reply. Consider increasing "
+                          "at_early_margin (%d)?", olddl, at_early_margin);
+
                  /* Return an error so we're not re-added to the timed list. */
                  RETURN(-ETIMEDOUT);
          }
  
          if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){
-                CDEBUG(D_INFO, "Wanted to ask client for more time, but no AT "
-                      "support\n");
+                DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, "
+                          "but no AT support");
                  RETURN(-ENOSYS);
          }
  
-        if (extra_time) {
-                /* Fake our processing time into the future to ask the
-                   clients for some extra amount of time */
-                extra_time += cfs_time_current_sec() -
-                        req->rq_arrival_time.tv_sec;
-                at_add(&svc->srv_at_estimate, extra_time);
+        if (req->rq_export && req->rq_export->exp_in_recovery) {
+                /* don't increase server estimates during recovery, and give
+                   clients the full recovery time. */
+                newdl = cfs_time_current_sec() +
+                        req->rq_export->exp_obd->obd_recovery_timeout;
+        } else {
+                if (extra_time) {
+                        /* Fake our processing time into the future to ask the
+                           clients for some extra amount of time */
+                        extra_time += cfs_time_current_sec() -
+                                      req->rq_arrival_time.tv_sec;
+                        at_add(&svc->srv_at_estimate, extra_time);
+                }
+                newdl = req->rq_arrival_time.tv_sec +
+                        at_get(&svc->srv_at_estimate);
          }
-
-        newdl = req->rq_arrival_time.tv_sec + at_get(&svc->srv_at_estimate);
          if (req->rq_deadline >= newdl) {
                  /* We're not adding any time, no need to send an early reply
                     (e.g. maybe at adaptive_max) */
-                CDEBUG(D_ADAPTTO, "x"LPU64": Couldn't add any time (%ld/%ld), "
-                       "not sending early reply\n", req->rq_xid, olddl,
-                       newdl - cfs_time_current_sec());
+                DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
+                          "(%ld/%ld), not sending early reply\n",
+                          olddl, newdl - cfs_time_current_sec());
                  RETURN(-ETIMEDOUT);
          }
  
@@ -716,7 +764,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
                  OBD_FREE(reqcopy, sizeof *reqcopy);
                  RETURN(-ENOMEM);
          }
-        
+
          *reqcopy = *req;
          reqcopy->rq_reply_state = NULL;
          reqcopy->rq_rep_swab_mask = 0;
@@ -725,8 +773,8 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
          memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
  
          if (req->rq_sent_final) {
-                CDEBUG(D_ADAPTTO, "x"LPU64": normal reply already sent out, "
-                       "abort sending early reply\n", req->rq_xid);
+                DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, "
+                          "abort sending early reply\n");
                  GOTO(out, rc = 0);
          }
  
@@ -738,12 +786,12 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
  
          /* RPC ref */
          class_export_rpc_get(reqcopy->rq_export);
-        if (reqcopy->rq_export->exp_obd && 
+        if (reqcopy->rq_export->exp_obd &&
              reqcopy->rq_export->exp_obd->obd_fail)
                  GOTO(out_put, rc = -ENODEV);
  
          rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
-        if (rc) 
+        if (rc)
                  GOTO(out_put, rc);
  
          rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
@@ -756,7 +804,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
                  DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
          }
  
-        /* Free the (early) reply state from lustre_pack_reply. 
+        /* Free the (early) reply state from lustre_pack_reply.
             (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */
          ptlrpc_req_drop_rs(reqcopy);
  
@@ -787,10 +835,10 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
          }
          delay = cfs_time_sub(cfs_time_current(), svc->srv_at_checktime);
          svc->srv_at_check = 0;
-        
+
          if (list_empty(&svc->srv_at_list)) {
                  spin_unlock(&svc->srv_at_lock);
-                RETURN(0);      
+                RETURN(0);
          }
  
          /* The timer went off, but maybe the nearest rpc already completed. */
@@ -801,10 +849,10 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
                  /* We've still got plenty of time.  Reset the timer. */
                  spin_unlock(&svc->srv_at_lock);
                  ptlrpc_at_set_timer(svc);
-                RETURN(0);      
+                RETURN(0);
          }
  
-        /* We're close to a timeout, and we don't know how much longer the 
+        /* We're close to a timeout, and we don't know how much longer the
             server will take. Send early replies to everyone expiring soon. */
          CFS_INIT_LIST_HEAD(&work_list);
          list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) {
@@ -823,9 +871,9 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
  
          CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
                 "replies\n", first, at_extra, counter);
-        
+
          if (first < 0) {
-                /* We're already past request deadlines before we even get a 
+                /* We're already past request deadlines before we even get a
                     chance to send early replies */
                  LCONSOLE_WARN("%s: This server is not able to keep up with "
                                "request traffic (cpu-bound).\n",  svc->srv_name);
@@ -835,8 +883,8 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
                        at_get(&svc->srv_at_estimate), delay);
          }
  
-        /* ptlrpc_server_free_request may delete an entry out of the work
-           list */
+        /* ptlrpc_server_finish_request may delete an entry out of
+         * the work list */
          spin_lock(&svc->srv_at_lock);
          while (!list_empty(&work_list)) {
                  rq = list_entry(work_list.next, struct ptlrpc_request,
@@ -850,7 +898,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
                  if (ptlrpc_at_send_early_reply(rq, at_extra) == 0)
                          ptlrpc_at_add_timed(rq);
  
-                ptlrpc_server_req_decref(rq);
+                ptlrpc_server_drop_request(rq);
                  spin_lock(&svc->srv_at_lock);
          }
          spin_unlock(&svc->srv_at_lock);
@@ -858,6 +906,167 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
          RETURN(0);
  }
  
+/**
+ * Put the request to the export list if the request may become
+ * a high priority one.
+ */
+static int ptlrpc_hpreq_init(struct ptlrpc_service *svc,
+                             struct ptlrpc_request *req)
+{
+        int rc;
+        ENTRY;
+
+        if (svc->srv_hpreq_handler) {
+                rc = svc->srv_hpreq_handler(req);
+                if (rc)
+                        RETURN(rc);
+        }
+        if (req->rq_export && req->rq_ops) {
+                spin_lock(&req->rq_export->exp_lock);
+                list_add(&req->rq_exp_list, &req->rq_export->exp_queued_rpc);
+                spin_unlock(&req->rq_export->exp_lock);
+        }
+
+        RETURN(0);
+}
+
+/** Remove the request from the export list. */
+static void ptlrpc_hpreq_fini(struct ptlrpc_request *req)
+{
+        ENTRY;
+        if (req->rq_export && req->rq_ops) {
+                spin_lock(&req->rq_export->exp_lock);
+                list_del_init(&req->rq_exp_list);
+                spin_unlock(&req->rq_export->exp_lock);
+        }
+        EXIT;
+}
+
+/**
+ * Make the request a high priority one.
+ *
+ * All the high priority requests are queued in a separate FIFO
+ * ptlrpc_service::srv_request_hpq list which is parallel to
+ * ptlrpc_service::srv_request_queue list but has a higher priority
+ * for handling.
+ *
+ * \see ptlrpc_server_handle_request().
+ */
+static void ptlrpc_hpreq_reorder_nolock(struct ptlrpc_service *svc,
+                                        struct ptlrpc_request *req)
+{
+        ENTRY;
+        LASSERT(svc != NULL);
+        spin_lock(&req->rq_lock);
+        if (req->rq_hp == 0) {
+                int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+                /* Add to the high priority queue. */
+                list_move_tail(&req->rq_list, &svc->srv_request_hpq);
+                req->rq_hp = 1;
+                if (opc != OBD_PING)
+                        DEBUG_REQ(D_NET, req, "high priority req");
+        }
+        spin_unlock(&req->rq_lock);
+        EXIT;
+}
+
+void ptlrpc_hpreq_reorder(struct ptlrpc_request *req)
+{
+        struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+        ENTRY;
+
+        spin_lock(&svc->srv_lock);
+        /* It may happen that the request is already taken for the processing
+         * but still in the export list, do not re-add it into the HP list. */
+        if (req->rq_phase == RQ_PHASE_NEW)
+                ptlrpc_hpreq_reorder_nolock(svc, req);
+        spin_unlock(&svc->srv_lock);
+        EXIT;
+}
+
+/** Check if the request if a high priority one. */
+static int ptlrpc_server_hpreq_check(struct ptlrpc_request *req)
+{
+        int opc, rc = 0;
+        ENTRY;
+
+        /* Check by request opc. */
+        opc = lustre_msg_get_opc(req->rq_reqmsg);
+        if (opc == OBD_PING)
+                RETURN(1);
+
+        /* Perform request specific check. */
+        if (req->rq_ops && req->rq_ops->hpreq_check)
+                rc = req->rq_ops->hpreq_check(req);
+        RETURN(rc);
+}
+
+/** Check if a request is a high priority one. */
+static int ptlrpc_server_request_add(struct ptlrpc_service *svc,
+                                     struct ptlrpc_request *req)
+{
+        int rc;
+        ENTRY;
+
+        rc = ptlrpc_server_hpreq_check(req);
+        if (rc < 0)
+                RETURN(rc);
+
+        spin_lock(&svc->srv_lock);
+        /* Before inserting the request into the queue, check if it is not
+         * inserted yet, or even already handled -- it may happen due to
+         * a racing ldlm_server_blocking_ast(). */
+        if (req->rq_phase == RQ_PHASE_NEW && list_empty(&req->rq_list)) {
+                if (rc)
+                        ptlrpc_hpreq_reorder_nolock(svc, req);
+                else
+                        list_add_tail(&req->rq_list, &svc->srv_request_queue);
+        }
+        spin_unlock(&svc->srv_lock);
+
+        RETURN(0);
+}
+
+/* Only allow normal priority requests on a service that has a high-priority
+ * queue if forced (i.e. cleanup), if there are other high priority requests
+ * already being processed (i.e. those threads can service more high-priority
+ * requests), or if there are enough idle threads that a later thread can do
+ * a high priority request. */
+static int ptlrpc_server_allow_normal(struct ptlrpc_service *svc, int force)
+{
+        return force || !svc->srv_hpreq_handler || svc->srv_n_hpreq > 0 ||
+               svc->srv_n_active_reqs < svc->srv_threads_running - 2;
+}
+
+static struct ptlrpc_request *
+ptlrpc_server_request_get(struct ptlrpc_service *svc, int force)
+{
+        struct ptlrpc_request *req = NULL;
+        ENTRY;
+
+        if (ptlrpc_server_allow_normal(svc, force) &&
+            !list_empty(&svc->srv_request_queue) &&
+            (list_empty(&svc->srv_request_hpq) ||
+             svc->srv_hpreq_count >= svc->srv_hpreq_ratio)) {
+                req = list_entry(svc->srv_request_queue.next,
+                                 struct ptlrpc_request, rq_list);
+                svc->srv_hpreq_count = 0;
+        } else if (!list_empty(&svc->srv_request_hpq)) {
+                req = list_entry(svc->srv_request_hpq.next,
+                                 struct ptlrpc_request, rq_list);
+                svc->srv_hpreq_count++;
+        }
+        RETURN(req);
+}
+
+static int ptlrpc_server_request_pending(struct ptlrpc_service *svc, int force)
+{
+        return ((ptlrpc_server_allow_normal(svc, force) &&
+                 !list_empty(&svc->srv_request_queue)) ||
+                !list_empty(&svc->srv_request_hpq));
+}
+
  /* Handle freshly incoming reqs, add to timed early reply list,
     pass on to regular request queue */
  static int
@@ -919,20 +1128,19 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
                  lustre_msg_get_handle(req->rq_reqmsg));
          if (req->rq_export) {
                  rc = ptlrpc_check_req(req);
-                class_export_put(req->rq_export);
-                req->rq_export = NULL;
-                if (rc) 
+                if (rc)
                          goto err_req;
+                ptlrpc_update_export_timer(req->rq_export, 0);
          }
  
          /* req_in handling should/must be fast */
-        if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5) 
+        if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
                  DEBUG_REQ(D_WARNING, req, "Slow req_in handling %lus",
                            cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
  
          /* Set rpc server deadline and add it to the timed list */
          deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
-                    MSGHDR_AT_SUPPORT) ? 
+                    MSGHDR_AT_SUPPORT) ?
                     /* The max time the client expects us to take */
                     lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
          req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
@@ -940,14 +1148,17 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
                  DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
                  goto err_req;
          }
-        
+
          ptlrpc_at_add_timed(req);
+        rc = ptlrpc_hpreq_init(svc, req);
+        if (rc)
+                GOTO(err_req, rc);
  
          /* Move it over to the request processing queue */
-        spin_lock(&svc->srv_lock);
-        list_add_tail(&req->rq_list, &svc->srv_request_queue);
+        rc = ptlrpc_server_request_add(svc, req);
+        if (rc)
+                GOTO(err_req, rc);
          cfs_waitq_signal(&svc->srv_waitq);
-        spin_unlock(&svc->srv_lock);
          RETURN(1);
  
  err_req:
@@ -955,7 +1166,7 @@ err_req:
          svc->srv_n_queued_reqs--;
          svc->srv_n_active_reqs++;
          spin_unlock(&svc->srv_lock);
-        ptlrpc_server_free_request(req);
+        ptlrpc_server_finish_request(req);
  
          RETURN(1);
  }
@@ -969,35 +1180,71 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
          struct timeval         work_start;
          struct timeval         work_end;
          long                   timediff;
-        int                    rc;
+        int                    opc, rc;
+        int                    fail_opc = 0;
          ENTRY;
  
          LASSERT(svc);
  
          spin_lock(&svc->srv_lock);
-        if (list_empty (&svc->srv_request_queue) ||
+        if (!ptlrpc_server_request_pending(svc, 0) ||
              (
  #ifndef __KERNEL__
               /* !@%$# liblustre only has 1 thread */
               svc->srv_n_difficult_replies != 0 &&
  #endif
               svc->srv_n_active_reqs >= (svc->srv_threads_running - 1))) {
-                /* Don't handle regular requests in the last thread, in order               * remain free to handle any 'difficult' replies (that might
+                /* Don't handle regular requests in the last thread, in order
                   * to handle difficult replies (which might block other threads)
-                 * as well as handle any incoming reqs, early replies, etc. 
+                 * as well as handle any incoming reqs, early replies, etc.
                   * That means we always need at least 2 service threads. */
                  spin_unlock(&svc->srv_lock);
                  RETURN(0);
          }
  
-        request = list_entry (svc->srv_request_queue.next,
-                              struct ptlrpc_request, rq_list);
-        list_del_init (&request->rq_list);
+        request = ptlrpc_server_request_get(svc, 0);
+        if  (request == NULL) {
+                spin_unlock(&svc->srv_lock);
+                RETURN(0);
+        }
+
+        opc = lustre_msg_get_opc(request->rq_reqmsg);
+        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
+                fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
+        else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
+                fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;
+
+        if (unlikely(fail_opc)) {
+                if (request->rq_export && request->rq_ops) {
+                        spin_unlock(&svc->srv_lock);
+                        OBD_FAIL_TIMEOUT(fail_opc, 4);
+                        spin_lock(&svc->srv_lock);
+                        request = ptlrpc_server_request_get(svc, 0);
+                        if  (request == NULL) {
+                                spin_unlock(&svc->srv_lock);
+                                RETURN(0);
+                        }
+                        LASSERT(ptlrpc_server_request_pending(svc, 0));
+                }
+        }
+
+        list_del_init(&request->rq_list);
          svc->srv_n_queued_reqs--;
          svc->srv_n_active_reqs++;
  
+        if (request->rq_hp)
+                svc->srv_n_hpreq++;
+
+        /* The phase is changed under the lock here because we need to know
+         * the request is under processing (see ptlrpc_hpreq_reorder()). */
+        ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);
          spin_unlock(&svc->srv_lock);
  
+        ptlrpc_hpreq_fini(request);
+
+        if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
+                libcfs_debug_dumplog();
+
          do_gettimeofday(&work_start);
          timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL);
          if (svc->srv_stats != NULL) {
@@ -1010,13 +1257,10 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                  lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
                                      at_get(&svc->srv_at_estimate));
          }
-        
+
          CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
-        
-        request->rq_svc_thread = thread;
-        request->rq_export = class_conn2export(
-                                     lustre_msg_get_handle(request->rq_reqmsg));
  
+        request->rq_svc_thread = thread;
          if (request->rq_export) {
                  if (ptlrpc_check_req(request))
                          goto put_conn;
@@ -1024,7 +1268,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                  export = class_export_rpc_get(request->rq_export);
          }
  
-        /* Discard requests queued for longer than the deadline.  
+        /* Discard requests queued for longer than the deadline.
             The deadline is increased if we send an early reply. */
          if (cfs_time_current_sec() > request->rq_deadline) {
                  DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s"
@@ -1036,8 +1280,6 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                  goto put_rpc_export;
          }
  
-        request->rq_phase = RQ_PHASE_INTERPRET;
-
          CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
                 "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(),
                 (request->rq_export ?
@@ -1051,8 +1293,8 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
          OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, obd_fail_val);
  
          rc = svc->srv_handler(request);
-        
-        request->rq_phase = RQ_PHASE_COMPLETE;
+
+        ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);
  
          CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
                 "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(),
@@ -1069,9 +1311,6 @@ put_rpc_export:
                  class_export_rpc_put(export);
  
  put_conn:
-        if (request->rq_export != NULL)
-                class_export_put(request->rq_export);
-
          if (cfs_time_current_sec() > request->rq_deadline) {
                  DEBUG_REQ(D_WARNING, request, "Request x"LPU64" took longer "
                            "than estimated (%ld%+lds); client may timeout.",
@@ -1089,7 +1328,7 @@ put_conn:
                 cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
                 request->rq_repmsg ? lustre_msg_get_transno(request->rq_repmsg) :
                 request->rq_transno, request->rq_status,
-               request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg): 
+               request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg):
                 -999);
          if (svc->srv_stats != NULL) {
                  __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
@@ -1102,13 +1341,17 @@ put_conn:
                  }
          }
          if (request->rq_early_count) {
-                DEBUG_REQ(D_ADAPTTO, request, 
+                DEBUG_REQ(D_ADAPTTO, request,
                            "sent %d early replies before finishing in %lds",
                            request->rq_early_count,
                            work_end.tv_sec - request->rq_arrival_time.tv_sec);
          }
  
-        ptlrpc_server_free_request(request);
+        spin_lock(&svc->srv_lock);
+        if (request->rq_hp)
+                svc->srv_n_hpreq--;
+        spin_unlock(&svc->srv_lock);
+        ptlrpc_server_finish_request(request);
  
          RETURN(1);
  }
@@ -1143,10 +1386,10 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
          /* Disengage from notifiers carefully (lock order - irqrestore below!)*/
          spin_unlock(&svc->srv_lock);
  
-        spin_lock (&obd->obd_uncommitted_replies_lock);
+        spin_lock (&exp->exp_uncommitted_replies_lock);
          /* Noop if removed already */
          list_del_init (&rs->rs_obd_list);
-        spin_unlock (&obd->obd_uncommitted_replies_lock);
+        spin_unlock (&exp->exp_uncommitted_replies_lock);
  
          spin_lock (&exp->exp_lock);
          /* Noop if removed already */
@@ -1165,9 +1408,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
                  /* If we see this, we should already have seen the warning
                   * in mds_steal_ack_locks()  */
                  CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
-                      " o%d NID %s\n",
-                      rs,
-                      rs->rs_xid, rs->rs_transno,
+                      " o%d NID %s\n", rs, rs->rs_xid, rs->rs_transno,
                        lustre_msg_get_opc(rs->rs_msg),
                        libcfs_nid2str(exp->exp_connection->c_peer.nid));
          }
@@ -1301,7 +1542,6 @@ static int ptlrpc_main(void *arg)
          struct ptlrpc_thread   *thread = data->thread;
          struct obd_device      *dev = data->dev;
          struct ptlrpc_reply_state *rs;
-        struct lc_watchdog     *watchdog;
  #ifdef WITH_GROUP_INFO
          struct group_info *ginfo = NULL;
  #endif
@@ -1359,9 +1599,10 @@ static int ptlrpc_main(void *arg)
           */
          cfs_waitq_signal(&thread->t_ctl_waitq);
  
-        watchdog = lc_watchdog_add(max_t(int, obd_timeout, AT_OFF ? 0 : 
-                                   at_get(&svc->srv_at_estimate)) * 
-                                   svc->srv_watchdog_factor, NULL, NULL);
+        thread->t_watchdog = lc_watchdog_add(max_t(int, obd_timeout, AT_OFF ? 0 :
+                                                   at_get(&svc->srv_at_estimate))
+                                             *  svc->srv_watchdog_factor,
+                                             NULL, NULL);
  
          spin_lock(&svc->srv_lock);
          svc->srv_threads_running++;
@@ -1380,7 +1621,7 @@ static int ptlrpc_main(void *arg)
                  struct l_wait_info lwi = LWI_TIMEOUT(svc->srv_rqbd_timeout,
                                                       ptlrpc_retry_rqbds, svc);
  
-                lc_watchdog_disable(watchdog);
+                lc_watchdog_disable(thread->t_watchdog);
  
                  cond_resched();
  
@@ -1391,15 +1632,15 @@ static int ptlrpc_main(void *arg)
                                 svc->srv_rqbd_timeout == 0) ||
                                !list_empty(&svc->srv_req_in_queue) ||
                                !list_empty(&svc->srv_reply_queue) ||
-                              (!list_empty(&svc->srv_request_queue) &&
+                              (ptlrpc_server_request_pending(svc, 0) &&
                                 (svc->srv_n_active_reqs <
                                  (svc->srv_threads_running - 1))) ||
                                svc->srv_at_check,
                                &lwi);
  
-                lc_watchdog_touch_ms(watchdog, max_t(int, obd_timeout,
-                                     AT_OFF ? 0 : 
-                                     at_get(&svc->srv_at_estimate)) * 
+                lc_watchdog_touch_ms(thread->t_watchdog, max_t(int, obd_timeout,
+                                     AT_OFF ? 0 :
+                                     at_get(&svc->srv_at_estimate)) *
                                       svc->srv_watchdog_factor);
  
                  ptlrpc_check_rqbd_pool(svc);
@@ -1416,17 +1657,17 @@ static int ptlrpc_main(void *arg)
                  if (!list_empty(&svc->srv_req_in_queue)) {
                          /* Process all incoming reqs before handling any */
                          ptlrpc_server_handle_req_in(svc);
-                        /* but limit ourselves in case of flood */ 
+                        /* but limit ourselves in case of flood */
                          if (counter++ < 1000)
                                  continue;
                          counter = 0;
                  }
  
-                if (svc->srv_at_check) 
+                if (svc->srv_at_check)
                          ptlrpc_at_check_timed(svc);
  
                  /* don't handle requests in the last thread */
-                if (!list_empty (&svc->srv_request_queue) &&
+                if (ptlrpc_server_request_pending(svc, 0) &&
                      (svc->srv_n_active_reqs < (svc->srv_threads_running - 1)))
                          ptlrpc_server_handle_request(svc, thread);
  
@@ -1441,7 +1682,8 @@ static int ptlrpc_main(void *arg)
                  }
          }
  
-        lc_watchdog_delete(watchdog);
+        lc_watchdog_delete(thread->t_watchdog);
+        thread->t_watchdog = NULL;
  
  out_srv_init:
          /*
@@ -1508,6 +1750,7 @@ int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc)
  
          /* We require 2 threads min - see note in
           * ptlrpc_server_handle_request() */
+
          LASSERT(svc->srv_threads_min >= 2);
          for (i = 0; i < svc->srv_threads_min; i++) {
                  rc = ptlrpc_start_thread(dev, svc);
@@ -1563,7 +1806,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc)
          d.thread = thread;
  
          CDEBUG(D_RPCTRACE, "starting thread '%s'\n", name);
-        
+
          /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
           * just drop the VM and FILES in ptlrpc_daemonize() right away.
           */
@@ -1618,7 +1861,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
           * its 'unlink' flag set for each posted rqbd */
          list_for_each(tmp, &service->srv_active_rqbds) {
                  struct ptlrpc_request_buffer_desc *rqbd =
-                        list_entry(tmp, struct ptlrpc_request_buffer_desc, 
+                        list_entry(tmp, struct ptlrpc_request_buffer_desc,
                                     rqbd_list);
  
                  rc = LNetMDUnlink(rqbd->rqbd_md_h);
@@ -1637,7 +1880,8 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
  
                  /* Network access will complete in finite time but the HUGE
                   * timeout lets us CWARN for visibility of sluggish NALs */
-                lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL);
+                lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                           cfs_time_seconds(1), NULL, NULL);
                  rc = l_wait_event(service->srv_waitq,
                                    service->srv_nrqbd_receiving == 0,
                                    &lwi);
@@ -1668,18 +1912,17 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
                  list_del(&req->rq_list);
                  service->srv_n_queued_reqs--;
                  service->srv_n_active_reqs++;
-                ptlrpc_server_free_request(req);
+                ptlrpc_server_finish_request(req);
          }
-        while (!list_empty(&service->srv_request_queue)) {
-                struct ptlrpc_request *req =
-                        list_entry(service->srv_request_queue.next,
-                                   struct ptlrpc_request,
-                                   rq_list);
+        while (ptlrpc_server_request_pending(service, 1)) {
+                struct ptlrpc_request *req;
  
+                req = ptlrpc_server_request_get(service, 1);
                  list_del(&req->rq_list);
                  service->srv_n_queued_reqs--;
                  service->srv_n_active_reqs++;
-                ptlrpc_server_free_request(req);
+                ptlrpc_hpreq_fini(req);
+                ptlrpc_server_finish_request(req);
          }
          LASSERT(service->srv_n_queued_reqs == 0);
          LASSERT(service->srv_n_active_reqs == 0);
@@ -1742,18 +1985,22 @@ int ptlrpc_service_health_check(struct ptlrpc_service *svc)
          do_gettimeofday(&right_now);
  
          spin_lock(&svc->srv_lock);
-        if (list_empty(&svc->srv_request_queue)) {
+        if (!ptlrpc_server_request_pending(svc, 1)) {
                  spin_unlock(&svc->srv_lock);
                  return 0;
          }
-        
+
          /* How long has the next entry been waiting? */
-        request = list_entry(svc->srv_request_queue.next,
-                             struct ptlrpc_request, rq_list);
+        if (list_empty(&svc->srv_request_queue))
+                request = list_entry(svc->srv_request_hpq.next,
+                                     struct ptlrpc_request, rq_list);
+        else
+                request = list_entry(svc->srv_request_queue.next,
+                                     struct ptlrpc_request, rq_list);
          timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
          spin_unlock(&svc->srv_lock);
  
-        if ((timediff / ONE_MILLION) > (AT_OFF ? obd_timeout * 3/2 : 
+        if ((timediff / ONE_MILLION) > (AT_OFF ? obd_timeout * 3/2 :
                                          at_max)) {
                  CERROR("%s: unhealthy - request has been waiting %lds\n",
                         svc->srv_name, timediff / ONE_MILLION);
diff --git a/lustre/ptlrpc/wirehdr.c b/lustre/ptlrpc/wirehdr.c

index a674953..aa5ff22 100644 (file)
--- a/lustre/ptlrpc/wirehdr.c
+++ b/lustre/ptlrpc/wirehdr.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #define DEBUG_SUBSYSTEM S_RPC
  #ifndef __KERNEL__
  # include <liblustre.h>
@@ -25,4 +61,3 @@
  #include <obd_class.h>
  #include <lustre_net.h>
  #include <lustre_disk.h>
-
diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c

index a70858b..998eff1 100644 (file)
--- a/lustre/ptlrpc/wiretest.c
+++ b/lustre/ptlrpc/wiretest.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #define DEBUG_SUBSYSTEM S_RPC
  #ifndef __KERNEL__
  # include <liblustre.h>
@@ -30,8 +66,8 @@ void lustre_assert_wire_constants(void)
  {
          /* Wire protocol assertions generated by 'wirecheck'
           * (make -C lustre/utils newwiretest)
-         * running on Linux xlab.hostel 2.6.23.12-52.fc7 #1 SMP Tue Dec 18 21:18:02 EST 2007 i686 i68
-         * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-7) */
+         * running on Linux vb1 2.6.18-build.1 #1 SMP Thu Mar 27 14:34:21 MDT 2008 i686 i686 i386 GNU
+         * with gcc version 4.1.2 20070626 (Red Hat 4.1.2-14) */
  
  
          /* Constants... */
@@ -147,7 +183,9 @@ void lustre_assert_wire_constants(void)
                   (long long)REINT_RENAME);
          LASSERTF(REINT_OPEN == 6, " found %lld\n",
                   (long long)REINT_OPEN);
-        LASSERTF(REINT_MAX == 7, " found %lld\n",
+        LASSERTF(REINT_SETXATTR == 7, " found %lld\n",
+                 (long long)REINT_SETXATTR);
+        LASSERTF(REINT_MAX == 8, " found %lld\n",
                   (long long)REINT_MAX);
          LASSERTF(MGS_CONNECT == 250, " found %lld\n",
                   (long long)MGS_CONNECT);
@@ -349,7 +387,7 @@ void lustre_assert_wire_constants(void)
          LASSERT(offsetof(struct lustre_msg_v1, lm_magic) == offsetof(struct lustre_msg_v2, lm_magic));
  
          /* Checks for struct ptlrpc_body */
-        LASSERTF((int)sizeof(struct ptlrpc_body) == 88, " found %lld\n",
+        LASSERTF((int)sizeof(struct ptlrpc_body) == 152, " found %lld\n",
                   (long long)(int)sizeof(struct ptlrpc_body));
          LASSERTF((int)offsetof(struct ptlrpc_body, pb_handle) == 0, " found %lld\n",
                   (long long)(int)offsetof(struct ptlrpc_body, pb_handle));
@@ -415,6 +453,14 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct ptlrpc_body, pb_limit));
          LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_limit) == 4, " found %lld\n",
                   (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_limit));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_pre_versions) == 88, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_pre_versions));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_pre_versions) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_pre_versions));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding) == 120, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_padding));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding));
  
          /* Checks for struct obd_connect_data */
          LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n",
@@ -498,7 +544,9 @@ void lustre_assert_wire_constants(void)
          CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL);
          CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL);
          CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL);
+        CLASSERT(OBD_CONNECT_FID == 0x40000000ULL);
          CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL);
+        CLASSERT(OBD_CONNECT_VBR == 0x80000000ULL);
  
          /* Checks for struct obdo */
          LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n",
@@ -728,6 +776,67 @@ void lustre_assert_wire_constants(void)
          LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
                   (long long)LOV_PATTERN_RAID1);
  
+        /* Checks for struct lov_mds_md_v3 */
+        LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_mds_md_v3));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_id) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_id));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_gr) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_gr));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects) == 0, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects));
+
+        /* Checks for struct lov_ost_data_v1 */
+        LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_ost_data_v1));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_id) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_object_id));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_gr) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_object_gr));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+        CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+        LASSERTF(LOV_PATTERN_RAID0 == 1, " found %lld\n",
+                 (long long)LOV_PATTERN_RAID0);
+        LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
+                 (long long)LOV_PATTERN_RAID1);
+
          /* Checks for struct lov_mds_md_join */
          LASSERTF((int)sizeof(struct lov_mds_md_join) == 56, " found %lld\n",
                   (long long)(int)sizeof(struct lov_mds_md_join));
@@ -1611,6 +1720,38 @@ void lustre_assert_wire_constants(void)
          LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
  
+        /* Checks for struct cfg_marker */
+        LASSERTF((int)sizeof(struct cfg_marker) == 160, " found %lld\n",
+                 (long long)(int)sizeof(struct cfg_marker));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_step));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_flags));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_vers));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_comment));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
          /* Checks for struct llog_logid */
          LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n",
                   (long long)(int)sizeof(struct llog_logid));
@@ -1740,10 +1881,10 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct llog_create_rec, lcr_oid));
          LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_oid) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_oid));
-        LASSERTF((int)offsetof(struct llog_create_rec, lcr_ogen) == 40, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_create_rec, lcr_ogen));
-        LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogen) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogen));
+        LASSERTF((int)offsetof(struct llog_create_rec, lcr_ogr) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_create_rec, lcr_ogr));
+        LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogr));
          LASSERTF((int)offsetof(struct llog_create_rec, padding) == 44, " found %lld\n",
                   (long long)(int)offsetof(struct llog_create_rec, padding));
          LASSERTF((int)sizeof(((struct llog_create_rec *)0)->padding) == 4, " found %lld\n",
@@ -1784,14 +1925,14 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
          LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
-        LASSERTF((int)offsetof(struct llog_unlink_rec, lur_ogen) == 24, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_unlink_rec, lur_ogen));
-        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen));
-        LASSERTF((int)offsetof(struct llog_unlink_rec, padding) == 28, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_unlink_rec, padding));
-        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->padding) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->padding));
+        LASSERTF((int)offsetof(struct llog_unlink_rec, lur_ogr) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_unlink_rec, lur_ogr));
+        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogr));
+        LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
          LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, " found %lld\n",
                   (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
          LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, " found %lld\n",
@@ -1808,10 +1949,10 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct llog_setattr_rec, lsr_oid));
          LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid));
-        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogen) == 24, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogen));
-        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogr) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogr));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogr));
          LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_uid) == 28, " found %lld\n",
                   (long long)(int)offsetof(struct llog_setattr_rec, lsr_uid));
          LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid) == 4, " found %lld\n",
@@ -1829,6 +1970,46 @@ void lustre_assert_wire_constants(void)
          LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail));
  
+        /* Checks for struct llog_setattr64_rec */
+        LASSERTF((int)sizeof(struct llog_setattr64_rec) == 56, " found %lld\n",
+                 (long long)(int)sizeof(struct llog_setattr64_rec));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oid) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_ogr) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_ogr));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogr));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->padding));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
          /* Checks for struct llog_size_change_rec */
          LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n",
                   (long long)(int)sizeof(struct llog_size_change_rec));
@@ -2201,7 +2382,79 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct lustre_disk_data, ldd_params));
          LASSERTF((int)sizeof(((struct lustre_disk_data *)0)->ldd_params) == 4096, " found %lld\n",
                   (long long)(int)sizeof(((struct lustre_disk_data *)0)->ldd_params));
-#ifdef LIBLUSTRE_POSIX_ACL
+
+        /* Checks for struct ll_user_fiemap */
+        LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, " found %lld\n",
+                 (long long)(int)sizeof(struct ll_user_fiemap));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_start));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_length));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_flags));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_extents));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents));
+        CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001);
+        CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002);
+        CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
+
+        /* Checks for struct ll_fiemap_extent */
+        LASSERTF((int)sizeof(struct ll_fiemap_extent) == 32, " found %lld\n",
+                 (long long)(int)sizeof(struct ll_fiemap_extent));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
+        CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
+        CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
+        CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
+        CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x00000008);
+        CLASSERT(FIEMAP_EXTENT_SECONDARY == 0x00000010);
+        CLASSERT(FIEMAP_EXTENT_NET == 0x00000020);
+        CLASSERT(FIEMAP_EXTENT_DATA_COMPRESSED == 0x00000040);
+        CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
+        CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
+        CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
+        CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
+        CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
+        CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+#if defined(LIBLUSTRE_POSIX_ACL) && defined(CONFIG_FS_POSIX_ACL)
  
          /* Checks for type posix_acl_xattr_entry */
          LASSERTF((int)sizeof(xattr_acl_entry) == 8, " found %lld\n",
diff --git a/lustre/quota/Makefile.in b/lustre/quota/Makefile.in

index 2f1bfad..f052b42 100644 (file)
--- a/lustre/quota/Makefile.in
+++ b/lustre/quota/Makefile.in
@@ -1,10 +1,7 @@
  MODULES := lquota
-MODULES += quotactl_test quotacheck_test
  
  lquota-objs := quota_check.o quota_context.o quota_ctl.o quota_interface.o
-lquota-objs += quota_master.o quota_adjust_qunit.o
-quotactl-objs := quotactl_test.o
-quotaccheck-objs := quotacheck_test.o
+lquota-objs += quota_master.o quota_adjust_qunit.o lproc_quota.o
  
  @INCLUDE_RULES@
  
diff --git a/lustre/quota/autoMakefile.am b/lustre/quota/autoMakefile.am

index a397190..9a20d28 100644 (file)
--- a/lustre/quota/autoMakefile.am
+++ b/lustre/quota/autoMakefile.am
@@ -1,7 +1,38 @@
-# Copyright (C) 2005  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  if LIBLUSTRE
  noinst_LIBRARIES = libquota.a
@@ -10,10 +41,9 @@ libquota_a_CPPFLAGS = $(LLCPPFLAGS)
  libquota_a_CFLAGS = $(LLCFLAGS)
  endif
  
-if MODULES
+if QUOTA
  modulefs_DATA = lquota$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
  DIST_SOURCES := $(lquota-objs:%.o=%.c) quota_internal.h
-DIST_SOURCES += quotactl_test.c quotacheck_test.c
diff --git a/lustre/quota/lproc_quota.c b/lustre/quota/lproc_quota.c

new file mode 100644 (file)

index 0000000..d211d65
--- /dev/null
+++ b/lustre/quota/lproc_quota.c
@@ -0,0 +1,772 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LQUOTA
+
+#include <linux/version.h>
+#include <lprocfs_status.h>
+#include <obd.h>
+#include <linux/seq_file.h>
+#include <lustre_fsfilt.h>
+
+#include "quota_internal.h"
+
+#ifdef HAVE_QUOTA_SUPPORT
+
+#ifdef LPROCFS
+int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_bunit_sz);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_bunit);
+
+int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val % QUOTABLOCK_SIZE ||
+            val <= obd->u.obt.obt_qctxt.lqc_btune_sz)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_bunit_sz = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_bunit);
+
+int lprocfs_quota_rd_btune(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_btune_sz);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_btune);
+
+int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val <= QUOTABLOCK_SIZE * MIN_QLIMIT || val % QUOTABLOCK_SIZE ||
+            val >= obd->u.obt.obt_qctxt.lqc_bunit_sz)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_btune_sz = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_btune);
+
+int lprocfs_quota_rd_iunit(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_iunit_sz);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_iunit);
+
+int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val <= obd->u.obt.obt_qctxt.lqc_itune_sz)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_iunit_sz = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_iunit);
+
+int lprocfs_quota_rd_itune(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_itune_sz);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_itune);
+
+int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val <= MIN_QLIMIT ||
+            val >= obd->u.obt.obt_qctxt.lqc_iunit_sz)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_itune_sz = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_itune);
+
+#define USER_QUOTA      1
+#define GROUP_QUOTA     2
+
+#define MAX_STYPE_SIZE  5
+
+/* The following information about CURRENT quotas is expected on the output:
+ * MDS: u for user quotas (administrative+operational) turned on,
+ *      g for group quotas (administrative+operational) turned on,
+ *      1 for 32-bit operational quotas and 32-bit administrative quotas,
+ *      2 for 32-bit operational quotas and 64-bit administrative quotas,
+ *      3 for 64-bit operational quotas and 64-bit administrative quotas
+ * OST: u for user quotas (operational) turned on,
+ *      g for group quotas (operational) turned on,
+ *      1 for 32-bit local operational quotas,
+ *      3 for 64-bit local operational quotas,
+ * Permanent parameters can be read with lctl (?)
+ */
+int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count,
+                          int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        char stype[MAX_STYPE_SIZE + 1] = "";
+        int oq_type, rc, is_mds;
+        lustre_quota_version_t aq_version, oq_version;
+        struct obd_device_target *obt;
+
+        LASSERT(obd != NULL);
+
+        obt = &obd->u.obt;
+        is_mds = !strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME);
+
+        /* Collect the needed information */
+        oq_type = obd->u.obt.obt_qctxt.lqc_flags;
+        oq_version = obt->obt_qfmt;
+        if (is_mds) {
+                rc = mds_quota_get_version(obd, &aq_version);
+                if (rc)
+                        return -EPROTO;
+                /* Here we can also assert that aq_type == oq_type
+                 * except for quota startup/shutdown states     */
+        }
+
+        /* Transform the collected data into a user-readable string */
+        if (oq_type & LQC_USRQUOTA_FLAG)
+                strcat(stype, "u");
+        if (oq_type & LQC_GRPQUOTA_FLAG)
+                strcat(stype, "g");
+
+        if ((!is_mds || aq_version == LUSTRE_QUOTA_V1) &&
+            oq_version == LUSTRE_QUOTA_V1)
+                strcat(stype, "1");
+#ifdef HAVE_QUOTA64
+        else if ((!is_mds || aq_version == LUSTRE_QUOTA_V2) &&
+                 oq_version == LUSTRE_QUOTA_V2)
+                strcat(stype, "3");
+#endif
+        else if (is_mds && aq_version == LUSTRE_QUOTA_V2 &&
+                 oq_version == LUSTRE_QUOTA_V1)
+                strcat(stype, "2");
+        else
+                return -EPROTO;
+
+        return snprintf(page, count, "%s\n", stype);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_type);
+
+static int auto_quota_on(struct obd_device *obd, int type,
+                         struct super_block *sb, int is_master)
+{
+        struct obd_quotactl *oqctl;
+        struct lvfs_run_ctxt saved;
+        int rc = 0, id;
+        struct obd_device_target *obt;
+        ENTRY;
+
+        LASSERT(type == USRQUOTA || type == GRPQUOTA || type == UGQUOTA);
+
+        obt = &obd->u.obt;
+
+        OBD_ALLOC_PTR(oqctl);
+        if (!oqctl)
+                RETURN(-ENOMEM);
+
+        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+                CDEBUG(D_INFO, "other people are doing quotacheck\n");
+                atomic_inc(&obt->obt_quotachecking);
+                RETURN(-EBUSY);
+        }
+
+        id = UGQUOTA2LQC(type);
+        /* quota already turned on */
+        if ((obt->obt_qctxt.lqc_flags & id) == id) {
+                rc = 0;
+                goto out;
+        }
+
+        oqctl->qc_type = type;
+        oqctl->qc_cmd = Q_QUOTAON;
+        oqctl->qc_id = obt->obt_qfmt;
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        if (is_master) {
+                struct mds_obd *mds = &obd->u.mds;
+
+                down(&mds->mds_qonoff_sem);
+                /* turn on cluster wide quota */
+                rc = mds_admin_quota_on(obd, oqctl);
+                if (rc)
+                        CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR,
+                               "auto-enable admin quota failed. rc=%d\n", rc);
+                up(&mds->mds_qonoff_sem);
+
+        }
+        if (!rc) {
+                /* turn on local quota */
+                rc = fsfilt_quotactl(obd, sb, oqctl);
+                if (rc)
+                        CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR,
+                               "auto-enable local quota failed. rc=%d\n", rc);
+                else
+                        obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(type);
+        }
+
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+out:
+        atomic_inc(&obt->obt_quotachecking);
+
+        OBD_FREE_PTR(oqctl);
+        RETURN(rc);
+}
+
+static int filter_quota_set_version(struct obd_device *obd,
+                                    lustre_quota_version_t version)
+{
+        struct obd_device_target *obt = &obd->u.obt;
+
+        if (version != LUSTRE_QUOTA_V1) {
+#ifdef HAVE_QUOTA64
+                if (version != LUSTRE_QUOTA_V2)
+#endif
+                        return -EINVAL;
+        }
+
+        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+                CDEBUG(D_INFO, "other people are doing quotacheck\n");
+                atomic_inc(&obt->obt_quotachecking);
+                return -EBUSY;
+        }
+
+        if (obt->obt_qctxt.lqc_flags & (LQC_USRQUOTA_FLAG | LQC_GRPQUOTA_FLAG)) {
+                atomic_inc(&obt->obt_quotachecking);
+                return -EBUSY;
+        }
+
+        obt->obt_qfmt = version;
+
+        atomic_inc(&obt->obt_quotachecking);
+
+        return 0;
+}
+
+/* The following settings of CURRENT quotas is expected on the input:
+ * MDS: u for user quotas (administrative+operational) turned on,
+ *      g for group quotas (administrative+operational) turned on,
+ *      1 for 32-bit operational quotas and 32-bit administrative quotas,
+ *      2 for 32-bit operational quotas and 64-bit administrative quotas,
+ *      3 for 64-bit operational quotas and 64-bit administrative quotas
+ * OST: u for user quotas (operational) turned on,
+ *      g for group quotas (operational) turned on,
+ *      1 for 32-bit local operational quotas,
+ *      2 for 32-bit local operational quotas,
+ *      3 for 64-bit local operational quotas,
+ * Permanent parameters can be set with lctl/tunefs
+ */
+int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+                          unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        struct obd_device_target *obt;
+        int type = 0, is_mds, idx;
+        unsigned long i;
+        char stype[MAX_STYPE_SIZE + 1] = "";
+        static const lustre_quota_version_t s2av[3] = {LUSTRE_QUOTA_V1,
+                                                       LUSTRE_QUOTA_V2,
+                                                       LUSTRE_QUOTA_V2},
+                                            s2ov[3] = {LUSTRE_QUOTA_V1,
+                                                       LUSTRE_QUOTA_V1,
+                                                       LUSTRE_QUOTA_V2};
+        LASSERT(obd != NULL);
+
+        obt = &obd->u.obt;
+
+        is_mds = !strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME);
+
+        if (count > MAX_STYPE_SIZE)
+                return -EINVAL;
+
+        if (copy_from_user(stype, buffer, count))
+                return -EFAULT;
+
+        for (i = 0 ; i < count ; i++) {
+                int rc;
+
+                switch (stype[i]) {
+                case 'u' :
+                        type |= USER_QUOTA;
+                        break;
+                case 'g' :
+                        type |= GROUP_QUOTA;
+                        break;
+                /* quota version specifiers */
+                case '1' :
+                case '2' :
+                case '3' :
+                        idx = stype[i] - '1';
+#ifndef HAVE_QUOTA64
+                        if (s2ov[idx] == LUSTRE_QUOTA_V2)
+                                return -EINVAL;
+#endif
+                        if (is_mds) {
+                                rc = mds_quota_set_version(obd, s2av[idx]);
+                                if (rc) {
+                                        CDEBUG(D_QUOTA, "failed to set admin "
+                                               "quota to spec %c! %d\n",
+                                               stype[i], rc);
+                                        return rc;
+                                }
+                        }
+                        rc = filter_quota_set_version(obd, s2ov[idx]);
+                        if (rc) {
+                                CDEBUG(D_QUOTA, "failed to set operational quota"
+                                       " to spec %c! %d\n", stype[i], rc);
+                                return rc;
+                        }
+                        break;
+                default  : /* just skip stray symbols like \n */
+                        break;
+                }
+        }
+
+        if (type != 0)
+                auto_quota_on(obd, type - 1, obt->obt_sb, is_mds);
+
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_type);
+
+int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%d\n",
+                        obd->u.obt.obt_qctxt.lqc_switch_seconds);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_switch_seconds);
+
+int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val <= 10)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_switch_seconds = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_switch_seconds);
+
+int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%d\n",
+                        obd->u.obt.obt_qctxt.lqc_sync_blk);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_sync_blk);
+
+int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val < 0)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_sync_blk = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_sync_blk);
+
+int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off,
+                               int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "changing qunit size is %s\n",
+                        obd->u.obt.obt_qctxt.lqc_switch_qs ?
+                        "enabled" : "disabled");
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_switch_qs);
+
+int lprocfs_quota_wr_switch_qs(struct file *file, const char *buffer,
+                               unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val)
+            obd->u.obt.obt_qctxt.lqc_switch_qs = 1;
+        else
+            obd->u.obt.obt_qctxt.lqc_switch_qs = 0;
+
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_switch_qs);
+
+int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_cqs_boundary_factor);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_boundary_factor);
+
+int lprocfs_quota_wr_boundary_factor(struct file *file, const char *buffer,
+                                     unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val < 2)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_cqs_boundary_factor = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_boundary_factor);
+
+int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_cqs_least_bunit);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_least_bunit);
+
+int lprocfs_quota_wr_least_bunit(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val < PTLRPC_MAX_BRW_SIZE ||
+            val >= obd->u.obt.obt_qctxt.lqc_bunit_sz)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_cqs_least_bunit = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_least_bunit);
+
+int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_cqs_least_iunit);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_least_iunit);
+
+int lprocfs_quota_wr_least_iunit(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val < 1 || val >= obd->u.obt.obt_qctxt.lqc_iunit_sz)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_cqs_least_iunit = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_least_iunit);
+
+int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off,
+                               int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+
+        return snprintf(page, count, "%lu\n",
+                        obd->u.obt.obt_qctxt.lqc_cqs_qs_factor);
+}
+EXPORT_SYMBOL(lprocfs_quota_rd_qs_factor);
+
+int lprocfs_quota_wr_qs_factor(struct file *file, const char *buffer,
+                               unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val < 2)
+                return -EINVAL;
+
+        obd->u.obt.obt_qctxt.lqc_cqs_qs_factor = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_quota_wr_qs_factor);
+
+struct lprocfs_vars lprocfs_quota_common_vars[] = {
+        { "quota_bunit_sz", lprocfs_quota_rd_bunit,
+                            lprocfs_quota_wr_bunit, 0},
+        { "quota_btune_sz", lprocfs_quota_rd_btune,
+                            lprocfs_quota_wr_btune, 0},
+        { "quota_iunit_sz", lprocfs_quota_rd_iunit,
+                            lprocfs_quota_wr_iunit, 0},
+        { "quota_itune_sz", lprocfs_quota_rd_itune,
+                            lprocfs_quota_wr_itune, 0},
+        { "quota_type",     lprocfs_quota_rd_type,
+                            lprocfs_quota_wr_type, 0},
+        { "quota_switch_seconds",  lprocfs_quota_rd_switch_seconds,
+                                   lprocfs_quota_wr_switch_seconds, 0 },
+        { "quota_sync_blk", lprocfs_quota_rd_sync_blk,
+                            lprocfs_quota_wr_sync_blk, 0},
+        { NULL }
+};
+
+struct lprocfs_vars lprocfs_quota_master_vars[] = {
+        { "quota_switch_qs", lprocfs_quota_rd_switch_qs,
+                             lprocfs_quota_wr_switch_qs, 0 },
+        { "quota_boundary_factor", lprocfs_quota_rd_boundary_factor,
+                                   lprocfs_quota_wr_boundary_factor, 0 },
+        { "quota_least_bunit", lprocfs_quota_rd_least_bunit,
+                               lprocfs_quota_wr_least_bunit, 0 },
+        { "quota_least_iunit", lprocfs_quota_rd_least_iunit,
+                               lprocfs_quota_wr_least_iunit, 0 },
+        { "quota_qs_factor",   lprocfs_quota_rd_qs_factor,
+                               lprocfs_quota_wr_qs_factor, 0 },
+        { NULL }
+};
+
+int lquota_proc_setup(struct obd_device *obd, int is_master)
+{
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(lquota_type_proc_dir && obd);
+        qctxt->lqc_proc_dir = lprocfs_register(obd->obd_name,
+                                               lquota_type_proc_dir,
+                                               lprocfs_quota_common_vars, obd);
+        if (IS_ERR(qctxt->lqc_proc_dir)) {
+                rc = PTR_ERR(qctxt->lqc_proc_dir);
+                CERROR("error %d setting up lprocfs for %s\n", rc,
+                       obd->obd_name);
+                qctxt->lqc_proc_dir = NULL;
+                GOTO(out, rc);
+        }
+
+        if (is_master) {
+                rc = lprocfs_add_vars(qctxt->lqc_proc_dir,
+                                      lprocfs_quota_master_vars, obd);
+                if (rc) {
+                        CERROR("error %d setting up lprocfs for %s"
+                               "(quota master)\n", rc, obd->obd_name);
+                        GOTO(out_free_proc, rc);
+                }
+        }
+
+        qctxt->lqc_stats = lprocfs_alloc_stats(LQUOTA_LAST_STAT -
+                                               LQUOTA_FIRST_STAT, 0);
+        if (!qctxt->lqc_stats)
+                GOTO(out_free_proc, rc = -ENOMEM);
+
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_SYNC_ACQ,
+                             LPROCFS_CNTR_AVGMINMAX, "sync_acq_req", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_SYNC_REL,
+                             LPROCFS_CNTR_AVGMINMAX, "sync_rel_req", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_ASYNC_ACQ,
+                             LPROCFS_CNTR_AVGMINMAX, "async_acq_req", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_ASYNC_REL,
+                             LPROCFS_CNTR_AVGMINMAX, "async_rel_req", "us");
+
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_CHK_BLK,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "wait_for_blk_quota(lquota_chkquota)", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_CHK_INO,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "wait_for_ino_quota(lquota_chkquota)", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_COMMIT_BLK,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "wait_for_blk_quota(lquota_pending_commit)",
+                             "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_COMMIT_INO,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "wait_for_ino_quota(lquota_pending_commit)",
+                             "us");
+
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_PENDING_BLK_QUOTA,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "wait_for_pending_blk_quota_req"
+                             "(qctxt_wait_pending_dqacq)", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_PENDING_INO_QUOTA,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "wait_for_pending_ino_quota_req"
+                             "(qctxt_wait_pending_dqacq)", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_NOWAIT_PENDING_BLK_QUOTA,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "nowait_for_pending_blk_quota_req"
+                             "(qctxt_wait_pending_dqacq)", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_NOWAIT_PENDING_INO_QUOTA,
+                             LPROCFS_CNTR_AVGMINMAX,
+                             "nowait_for_pending_ino_quota_req"
+                             "(qctxt_wait_pending_dqacq)", "us");
+
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_QUOTA_CTL,
+                             LPROCFS_CNTR_AVGMINMAX, "quota_ctl", "us");
+        lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_ADJUST_QUNIT,
+                             LPROCFS_CNTR_AVGMINMAX, "adjust_qunit", "us");
+
+        lprocfs_register_stats(qctxt->lqc_proc_dir, "stats", qctxt->lqc_stats);
+
+        RETURN(rc);
+
+out_free_proc:
+        lprocfs_remove(&qctxt->lqc_proc_dir);
+out:
+        RETURN(rc);
+}
+
+int lquota_proc_cleanup(struct lustre_quota_ctxt *qctxt)
+{
+        if (!qctxt || !qctxt->lqc_proc_dir)
+                return -EINVAL;
+
+        if (qctxt->lqc_stats != NULL)
+                lprocfs_free_stats(&qctxt->lqc_stats);
+
+        lprocfs_remove(&qctxt->lqc_proc_dir);
+        return 0;
+}
+
+#endif  /* LPROCFS */
+#endif
diff --git a/lustre/quota/quota_adjust_qunit.c b/lustre/quota/quota_adjust_qunit.c

index 3283a75..952eee5 100644 (file)
--- a/lustre/quota/quota_adjust_qunit.c
+++ b/lustre/quota/quota_adjust_qunit.c
@@ -1,19 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/quota/quota_adjust_qunit.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2005 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
-#define DEBUG_SUBSYSTEM S_MDS
+#define DEBUG_SUBSYSTEM S_LQUOTA
  
  #ifdef __KERNEL__
  # include <linux/version.h>
@@ -21,16 +44,11 @@
  # include <linux/init.h>
  # include <linux/fs.h>
  # include <linux/jbd.h>
-# include <linux/ext3_fs.h>
  # include <linux/quota.h>
-# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #  include <linux/smp_lock.h>
  #  include <linux/buffer_head.h>
  #  include <linux/workqueue.h>
  #  include <linux/mount.h>
-# else
-#  include <linux/locks.h>
-# endif
  #else /* __KERNEL__ */
  # include <liblustre.h>
  #endif
@@ -45,6 +63,8 @@
  #include <class_hash.h>
  #include "quota_internal.h"
  
+#ifdef HAVE_QUOTA_SUPPORT
+
  #ifdef __KERNEL__
  /* this function is charge of recording lqs_ino_rec and
   * lqs_blk_rec. when a lquota slave checks a quota
@@ -121,8 +141,7 @@ int quota_search_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq,
                  oqaq_tmp = oqaq;
          }
  
-        *lqs_return = lustre_hash_get_object_by_key(LQC_HASH_BODY(qctxt),
-                                                    oqaq_tmp);
+        *lqs_return = lustre_hash_lookup(qctxt->lqc_lqs_hash, oqaq_tmp);
          if (*lqs_return)
                  LQS_DEBUG((*lqs_return), "show lqs\n");
  
@@ -135,45 +154,42 @@ int quota_create_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq,
                       struct lustre_quota_ctxt *qctxt,
                       struct lustre_qunit_size **lqs_return)
  {
-        int rc = 0;
-        struct quota_adjust_qunit *oqaq_tmp = NULL;
          struct lustre_qunit_size *lqs = NULL;
+        int rc = 0;
          ENTRY;
  
          LASSERT(*lqs_return == NULL);
          LASSERT(oqaq || qdata);
  
-        if (!oqaq) {
-                OBD_ALLOC_PTR(oqaq_tmp);
-                if (!oqaq_tmp)
-                        RETURN(-ENOMEM);
-                qdata_to_oqaq(qdata, oqaq_tmp);
-        } else {
-                oqaq_tmp = oqaq;
-        }
-
          OBD_ALLOC_PTR(lqs);
          if (!lqs)
                  GOTO(out, rc = -ENOMEM);
  
+        if (!oqaq) {
+                qdata_to_oqaq(qdata, &lqs->lqs_key);
+        } else {
+                lqs->lqs_key = *oqaq;
+        }
+
          spin_lock_init(&lqs->lqs_lock);
          lqs->lqs_bwrite_pending = 0;
          lqs->lqs_iwrite_pending = 0;
          lqs->lqs_ino_rec = 0;
          lqs->lqs_blk_rec = 0;
-        lqs->lqs_id = oqaq_tmp->qaq_id;
-        lqs->lqs_flags = QAQ_IS_GRP(oqaq_tmp);
+        lqs->lqs_id = lqs->lqs_key.qaq_id;
+        lqs->lqs_flags = QAQ_IS_GRP(&lqs->lqs_key);
          lqs->lqs_bunit_sz = qctxt->lqc_bunit_sz;
          lqs->lqs_iunit_sz = qctxt->lqc_iunit_sz;
          lqs->lqs_btune_sz = qctxt->lqc_btune_sz;
          lqs->lqs_itune_sz = qctxt->lqc_itune_sz;
+        lqs->lqs_ctxt = qctxt;
          if (qctxt->lqc_handler) {
                  lqs->lqs_last_bshrink  = 0;
                  lqs->lqs_last_ishrink  = 0;
          }
          lqs_initref(lqs);
-        rc = lustre_hash_additem_unique(LQC_HASH_BODY(qctxt),
-                                        oqaq_tmp, &lqs->lqs_hash);
+        rc = lustre_hash_add_unique(qctxt->lqc_lqs_hash,
+                                    &lqs->lqs_key, &lqs->lqs_hash);
          LQS_DEBUG(lqs, "create lqs\n");
          if (!rc) {
                  lqs_getref(lqs);
@@ -182,8 +198,6 @@ int quota_create_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq,
   out:
          if (rc && lqs)
                  OBD_FREE_PTR(lqs);
-        if (!oqaq)
-                OBD_FREE_PTR(oqaq_tmp);
          RETURN(rc);
  }
  
@@ -211,7 +225,7 @@ search_lqs:
                          LQS_DEBUG(lqs, "release lqs\n");
                          /* this is for quota_search_lqs */
                          lqs_putref(lqs);
-                        /* this is for deleting this lqs */
+                        /* kill lqs */
                          lqs_putref(lqs);
                  }
                  RETURN(rc);
@@ -298,10 +312,10 @@ search_lqs:
  }
  
  int filter_quota_adjust_qunit(struct obd_export *exp,
-                              struct quota_adjust_qunit *oqaq)
+                              struct quota_adjust_qunit *oqaq,
+                              struct lustre_quota_ctxt *qctxt)
  {
          struct obd_device *obd = exp->exp_obd;
-        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
          unsigned int uid = 0, gid = 0;
          int rc = 0;
          ENTRY;
@@ -319,8 +333,8 @@ int filter_quota_adjust_qunit(struct obd_export *exp,
                  uid = oqaq->qaq_id;
  
          if (rc > 0) {
-                rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 0);
-                if (rc == -EDQUOT || rc == -EBUSY) {
+                rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 0, NULL);
+                if (rc == -EDQUOT || rc == -EBUSY || rc == -EAGAIN) {
                          CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                          rc = 0;
                  }
@@ -330,13 +344,15 @@ int filter_quota_adjust_qunit(struct obd_export *exp,
          RETURN(rc);
  }
  #endif /* __KERNEL__ */
+#endif
  
  int client_quota_adjust_qunit(struct obd_export *exp,
-                              struct quota_adjust_qunit *oqaq)
+                              struct quota_adjust_qunit *oqaq,
+                              struct lustre_quota_ctxt *qctxt)
  {
          struct ptlrpc_request *req;
          struct quota_adjust_qunit *oqa;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*oqaq) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*oqaq) };
          int rc = 0;
          ENTRY;
  
@@ -372,7 +388,8 @@ out:
  }
  
  int lov_quota_adjust_qunit(struct obd_export *exp,
-                           struct quota_adjust_qunit *oqaq)
+                           struct quota_adjust_qunit *oqaq,
+                           struct lustre_quota_ctxt *qctxt)
  {
          struct obd_device *obd = class_exp2obd(exp);
          struct lov_obd *lov = &obd->u.lov;
@@ -392,7 +409,8 @@ int lov_quota_adjust_qunit(struct obd_export *exp,
                          continue;
                  }
  
-                err = obd_quota_adjust_qunit(lov->lov_tgts[i]->ltd_exp, oqaq);
+                err = obd_quota_adjust_qunit(lov->lov_tgts[i]->ltd_exp, oqaq,
+                                             NULL);
                  if (err) {
                          if (lov->lov_tgts[i]->ltd_active && !rc)
                                  rc = err;
diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c

index 95bde12..5fbd4a7 100644 (file)
--- a/lustre/quota/quota_check.c
+++ b/lustre/quota/quota_check.c
@@ -1,35 +1,53 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/quota/quota_check.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2005 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
-#define DEBUG_SUBSYSTEM S_MDS
+#define DEBUG_SUBSYSTEM S_LQUOTA
  
  #ifdef __KERNEL__
  # include <linux/version.h>
  # include <linux/module.h>
  # include <linux/init.h>
-# include <linux/fs.h>
-# include <linux/jbd.h>
-# include <linux/ext3_fs.h>
-# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #  include <linux/smp_lock.h>
  #  include <linux/buffer_head.h>
  #  include <linux/workqueue.h>
  #  include <linux/mount.h>
-# else
-#  include <linux/locks.h>
-# endif
  #else /* __KERNEL__ */
  # include <liblustre.h>
  #endif
@@ -43,6 +61,7 @@
  #include <lustre_quota.h>
  #include "quota_internal.h"
  
+#ifdef HAVE_QUOTA_SUPPORT
  #ifdef __KERNEL__
  static int target_quotacheck_callback(struct obd_export *exp,
                                        struct obd_quotactl *oqctl)
@@ -148,13 +167,14 @@ out:
  }
  
  #endif /* __KERNEL__ */
+#endif /* HAVE_QUOTA_SUPPORT */
  
  int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl)
  {
          struct client_obd *cli = &exp->exp_obd->u.cli;
          struct ptlrpc_request *req;
          struct obd_quotactl *body;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) };
          int ver, opc, rc;
          ENTRY;
  
@@ -230,7 +250,7 @@ int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl)
                  }
  
                  err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl);
-                if (err && lov->lov_tgts[i]->ltd_active && !rc)
+                if (err && !rc)
                          rc = err;
          }
  
diff --git a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c

index 6f6efb8..2091b87 100644 (file)
--- a/lustre/quota/quota_context.c
+++ b/lustre/quota/quota_context.c
@@ -1,22 +1,50 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/quota/quota_context.c
- *  Lustre Quota Context
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2005 Cluster File Systems, Inc.
- *   Author: Niu YaWei <niu@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/quota/quota_context.c
+ *
+ * Lustre Quota Context
+ *
+ * Author: Niu YaWei <niu@clusterfs.com>
+ */
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
  
-#define DEBUG_SUBSYSTEM S_MDS
+#define DEBUG_SUBSYSTEM S_LQUOTA
  
  #include <linux/version.h>
  #include <linux/fs.h>
@@ -30,9 +58,12 @@
  #include <lustre_quota.h>
  #include <lustre_fsfilt.h>
  #include <class_hash.h>
+#include <lprocfs_status.h>
  #include "quota_internal.h"
  
-extern struct lustre_hash_operations lqs_hash_operations;
+#ifdef HAVE_QUOTA_SUPPORT
+
+static lustre_hash_ops_t lqs_hash_ops;
  
  unsigned long default_bunit_sz = 128 * 1024 * 1024; /* 128M bytes */
  unsigned long default_btune_ratio = 50;             /* 50 percentage */
@@ -207,6 +238,13 @@ check_cur_qunit(struct obd_device *obd,
          if (!sb_any_quota_enabled(sb))
                  RETURN(0);
  
+        spin_lock(&qctxt->lqc_lock);
+        if (!qctxt->lqc_valid){
+                spin_unlock(&qctxt->lqc_lock);
+                RETURN(0);
+        }
+        spin_unlock(&qctxt->lqc_lock);
+
          OBD_ALLOC_PTR(qctl);
          if (qctl == NULL)
                  RETURN(-ENOMEM);
@@ -254,7 +292,7 @@ check_cur_qunit(struct obd_device *obd,
          if (QDATA_IS_BLK(qdata)) {
                  qunit_sz = lqs->lqs_bunit_sz;
                  tune_sz  = lqs->lqs_btune_sz;
-                pending_write = lqs->lqs_bwrite_pending * CFS_PAGE_SIZE;
+                pending_write = lqs->lqs_bwrite_pending;
                  record   = lqs->lqs_blk_rec;
                  LASSERT(!(qunit_sz % QUOTABLOCK_SIZE));
          } else {
@@ -291,6 +329,12 @@ check_cur_qunit(struct obd_device *obd,
                         limit_org > qdata->qd_count + qunit_sz)
                          qdata->qd_count += qunit_sz;
                  ret = 2;
+                /* if there are other pending writes for this uid/gid, releasing
+                 * quota is put off until the last pending write b=16645 */
+                if (ret == 2 && pending_write) {
+                        CDEBUG(D_QUOTA, "delay quota release\n");
+                        ret = 0;
+                }
          }
          CDEBUG(D_QUOTA, "type: %c, limit: "LPU64", usage: "LPU64
                 ", pending_write: "LPU64", record: "LPD64
@@ -361,19 +405,6 @@ out:
          return ret;
  }
  
-/* caller must hold qunit_hash_lock */
-static struct lustre_qunit *dqacq_in_flight(struct lustre_quota_ctxt *qctxt,
-                                            struct qunit_data *qdata)
-{
-        unsigned int hashent = qunit_hashfn(qctxt, qdata);
-        struct lustre_qunit *qunit;
-        ENTRY;
-
-        LASSERT_SPIN_LOCKED(&qunit_hash_lock);
-        qunit = find_qunit(hashent, qctxt, qdata);
-        RETURN(qunit);
-}
-
  static struct lustre_qunit *alloc_qunit(struct lustre_quota_ctxt *qctxt,
                                          struct qunit_data *qdata, int opc)
  {
@@ -412,12 +443,28 @@ static void qunit_put(struct lustre_qunit *qunit)
                  free_qunit(qunit);
  }
  
+/* caller must hold qunit_hash_lock and release ref of qunit after using it */
+static struct lustre_qunit *dqacq_in_flight(struct lustre_quota_ctxt *qctxt,
+                                            struct qunit_data *qdata)
+{
+        unsigned int hashent = qunit_hashfn(qctxt, qdata);
+        struct lustre_qunit *qunit;
+        ENTRY;
+
+        LASSERT_SPIN_LOCKED(&qunit_hash_lock);
+        qunit = find_qunit(hashent, qctxt, qdata);
+        if (qunit)
+                qunit_get(qunit);
+        RETURN(qunit);
+}
+
  static void
  insert_qunit_nolock(struct lustre_quota_ctxt *qctxt, struct lustre_qunit *qunit)
  {
          struct list_head *head;
  
          LASSERT(list_empty(&qunit->lq_hash));
+        qunit_get(qunit);
          head = qunit_hash + qunit_hashfn(qctxt, &qunit->lq_data);
          list_add(&qunit->lq_hash, head);
          QUNIT_SET_STATE(qunit, QUNIT_IN_HASH);
@@ -450,6 +497,7 @@ static void remove_qunit_nolock(struct lustre_qunit *qunit)
  
          list_del_init(&qunit->lq_hash);
          QUNIT_SET_STATE(qunit, QUNIT_RM_FROM_HASH);
+        qunit_put(qunit);
  }
  
  #define INC_QLIMIT(limit, count) (limit == MIN_QLIMIT) ? \
@@ -466,7 +514,8 @@ is_master(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
  
  static int
  schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
-               struct qunit_data *qdata, int opc, int wait);
+               struct qunit_data *qdata, int opc, int wait,
+               struct obd_trans_info *oti);
  
  static int
  dqacq_completion(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
@@ -582,6 +631,9 @@ out:
          QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, rc);
          wake_up(&qunit->lq_waitq);
  
+        /* this is for dqacq_in_flight() */
+        qunit_put(qunit);
+        /* this is for alloc_qunit() */
          qunit_put(qunit);
          if (rc < 0 && rc != -EDQUOT)
                   RETURN(err);
@@ -614,7 +666,7 @@ out:
          if (rc1 > 0) {
                  int opc;
                  opc = rc1 == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
-                rc1 = schedule_dqacq(obd, qctxt, qdata, opc, 0);
+                rc1 = schedule_dqacq(obd, qctxt, qdata, opc, 0, NULL);
                  QDATA_DEBUG(qdata, "reschedudle opc(%d) rc(%d)\n", opc, rc1);
          }
          RETURN(err);
@@ -644,17 +696,17 @@ static int dqacq_interpret(struct ptlrpc_request *req, void *data, int rc)
          if (!qdata)
                  RETURN(-ENOMEM);
  
-        if (rc == -EIO || rc == -EINTR || rc == -ENOTCONN )
-                /* if a quota req timeouts or is dropped, we should update quota
-                 * statistics which will be handled in dqacq_completion. And in
-                 * this situation we should get qdata from request instead of
-                 * reply */
-                rc1 = quota_get_qdata(req, qdata, QUOTA_REQUEST, QUOTA_IMPORT);
-        else
-                rc1 = quota_get_qdata(req, qdata, QUOTA_REPLY, QUOTA_IMPORT);
+        /* if a quota req timeouts or is dropped, we should update quota
+         * statistics which will be handled in dqacq_completion. And in
+         * this situation we should get qdata from request instead of
+         * reply */
+        rc1 = quota_get_qdata(req, qdata,
+                              (rc != 0) ? QUOTA_REQUEST : QUOTA_REPLY,
+                              QUOTA_IMPORT);
          if (rc1 < 0) {
-                DEBUG_REQ(D_ERROR, req, "error unpacking qunit_data\n");
-                GOTO(exit, rc = -EPROTO);
+                DEBUG_REQ(D_ERROR, req,
+                          "error unpacking qunit_data(rc: %d)\n", rc1);
+                GOTO(exit, rc = rc1);
          }
  
          QDATA_DEBUG(qdata, "qdata: interpret rc(%d).\n", rc);
@@ -694,6 +746,20 @@ exit:
          RETURN(rc);
  }
  
+/* check if quota master is online */
+int check_qm(struct lustre_quota_ctxt *qctxt)
+{
+        int rc;
+        ENTRY;
+
+        spin_lock(&qctxt->lqc_lock);
+        /* quit waiting when mds is back or qctxt is cleaned up */
+        rc = qctxt->lqc_import || !qctxt->lqc_valid;
+        spin_unlock(&qctxt->lqc_lock);
+
+        RETURN(rc);
+}
+
  static int got_qunit(struct lustre_qunit *qunit)
  {
          int rc;
@@ -718,7 +784,8 @@ static int got_qunit(struct lustre_qunit *qunit)
  
  static int
  schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
-               struct qunit_data *qdata, int opc, int wait)
+               struct qunit_data *qdata, int opc, int wait,
+               struct obd_trans_info *oti)
  {
          struct lustre_qunit *qunit, *empty;
          struct l_wait_info lwi = { 0 };
@@ -727,28 +794,30 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
          int size[2] = { sizeof(struct ptlrpc_body), 0 };
          struct obd_import *imp = NULL;
          struct lustre_qunit_size *lqs = NULL;
+        struct timeval work_start;
+        struct timeval work_end;
+        long timediff;
          int rc = 0;
          ENTRY;
  
+        LASSERT(opc == QUOTA_DQACQ || opc == QUOTA_DQREL);
+        do_gettimeofday(&work_start);
          if ((empty = alloc_qunit(qctxt, qdata, opc)) == NULL)
                  RETURN(-ENOMEM);
  
          spin_lock(&qunit_hash_lock);
          qunit = dqacq_in_flight(qctxt, qdata);
          if (qunit) {
-                if (wait)
-                        qunit_get(qunit);
                  spin_unlock(&qunit_hash_lock);
-                free_qunit(empty);
+                qunit_put(empty);
  
                  goto wait_completion;
          }
          qunit = empty;
+        qunit_get(qunit);
          insert_qunit_nolock(qctxt, qunit);
          spin_unlock(&qunit_hash_lock);
  
-        LASSERT(qunit);
-
          quota_search_lqs(qdata, NULL, qctxt, &lqs);
          if (lqs) {
                  spin_lock(&lqs->lqs_lock);
@@ -772,6 +841,19 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
                  QDATA_SET_CHANGE_QS(qdata);
                  rc = qctxt->lqc_handler(obd, qdata, opc);
                  rc2 = dqacq_completion(obd, qctxt, qdata, rc, opc);
+                /* this is for qunit_get() */
+                qunit_put(qunit);
+
+                do_gettimeofday(&work_end);
+                timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+                if (opc == QUOTA_DQACQ)
+                        lprocfs_counter_add(qctxt->lqc_stats,
+                                            wait ? LQUOTA_SYNC_ACQ : LQUOTA_ASYNC_ACQ,
+                                            timediff);
+                else
+                        lprocfs_counter_add(qctxt->lqc_stats,
+                                            wait ? LQUOTA_SYNC_REL : LQUOTA_ASYNC_REL,
+                                            timediff);
                  RETURN(rc ? rc : rc2);
          }
  
@@ -789,7 +871,27 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
                  QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, -EAGAIN);
                  wake_up(&qunit->lq_waitq);
  
+                /* this is for qunit_get() */
                  qunit_put(qunit);
+                /* this for alloc_qunit() */
+                qunit_put(qunit);
+                spin_lock(&qctxt->lqc_lock);
+                if (wait && !qctxt->lqc_import) {
+                        spin_unlock(&qctxt->lqc_lock);
+
+                        LASSERT(oti && oti->oti_thread &&
+                                oti->oti_thread->t_watchdog);
+
+                        lc_watchdog_disable(oti->oti_thread->t_watchdog);
+                        CDEBUG(D_QUOTA, "sleep for quota master\n");
+                        l_wait_event(qctxt->lqc_wait_for_qmaster,
+                                     check_qm(qctxt), &lwi);
+                        CDEBUG(D_QUOTA, "wake up when quota master is back\n");
+                        lc_watchdog_touch(oti->oti_thread->t_watchdog);
+                } else {
+                        spin_unlock(&qctxt->lqc_lock);
+                }
+
                  RETURN(-EAGAIN);
          }
          imp = class_import_get(qctxt->lqc_import);
@@ -805,22 +907,26 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
          if (!req) {
                  dqacq_completion(obd, qctxt, qdata, -ENOMEM, opc);
                  class_import_put(imp);
+                /* this is for qunit_get() */
+                qunit_put(qunit);
                  RETURN(-ENOMEM);
          }
  
          rc = quota_copy_qdata(req, qdata, QUOTA_REQUEST, QUOTA_IMPORT);
          if (rc < 0) {
-                CDEBUG(D_ERROR, "Can't pack qunit_data\n");
-                RETURN(-EPROTO);
+                CDEBUG(D_ERROR, "Can't pack qunit_data(rc: %d)\n", rc);
+                dqacq_completion(obd, qctxt, qdata, rc, opc);
+                class_import_put(imp);
+                /* this is for qunit_get() */
+                qunit_put(qunit);
+                RETURN(rc);
          }
          ptlrpc_req_set_repsize(req, 2, size);
+        req->rq_no_resend = req->rq_no_delay = 1;
          class_import_put(imp);
  
-        if (wait && qunit)
-                qunit_get(qunit);
-
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
-        aa = (struct dqacq_async_args *)&req->rq_async_args;
+        aa = ptlrpc_req_async_args(req);
          aa->aa_ctxt = qctxt;
          aa->aa_qunit = qunit;
  
@@ -848,16 +954,29 @@ wait_completion:
                  spin_unlock(&qunit->lq_lock);
                  CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. (rc:%d)\n",
                         qunit, rc);
-                qunit_put(qunit);
          }
+
+        qunit_put(qunit);
+        do_gettimeofday(&work_end);
+        timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+        if (opc == QUOTA_DQACQ)
+                lprocfs_counter_add(qctxt->lqc_stats,
+                                    wait ? LQUOTA_SYNC_ACQ : LQUOTA_ASYNC_ACQ,
+                                    timediff);
+        else
+                lprocfs_counter_add(qctxt->lqc_stats,
+                                    wait ? LQUOTA_SYNC_REL : LQUOTA_ASYNC_REL,
+                                    timediff);
+
          RETURN(rc);
  }
  
  int
  qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
-                   uid_t uid, gid_t gid, __u32 isblk, int wait)
+                   uid_t uid, gid_t gid, __u32 isblk, int wait,
+                   struct obd_trans_info *oti)
  {
-        int ret, rc = 0, i = USRQUOTA;
+        int rc = 0, i = USRQUOTA;
          __u32 id[MAXQUOTAS] = { uid, gid };
          struct qunit_data qdata[MAXQUOTAS];
          ENTRY;
@@ -873,18 +992,21 @@ qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
                          QDATA_SET_BLK(&qdata[i]);
                  qdata[i].qd_count = 0;
  
-                ret = check_cur_qunit(obd, qctxt, &qdata[i]);
-                if (ret > 0) {
+                rc = check_cur_qunit(obd, qctxt, &qdata[i]);
+                if (rc > 0) {
                          int opc;
                          /* need acquire or release */
-                        opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
-                        ret = schedule_dqacq(obd, qctxt, &qdata[i], opc, wait);
-                        if (!rc)
-                                rc = ret;
+                        opc = rc == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
+                        rc = schedule_dqacq(obd, qctxt, &qdata[i], opc,
+                                            wait,oti);
+                        if (rc < 0)
+                                RETURN(rc);
                  } else if (wait == 1) {
                          /* when wait equates 1, that means mds_quota_acquire
                           * or filter_quota_acquire is calling it. */
-                        qctxt_wait_pending_dqacq(qctxt, id[i], i, isblk);
+                        rc = qctxt_wait_pending_dqacq(qctxt, id[i], i, isblk);
+                        if (rc < 0)
+                                RETURN(rc);
                  }
          }
  
@@ -897,9 +1019,14 @@ qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
  {
          struct lustre_qunit *qunit = NULL;
          struct qunit_data qdata;
+        struct timeval work_start;
+        struct timeval work_end;
+        long timediff;
          struct l_wait_info lwi = { 0 };
+        int rc = 0;
          ENTRY;
  
+        do_gettimeofday(&work_start);
          qdata.qd_id = id;
          qdata.qd_flags = type;
          if (isblk)
@@ -908,11 +1035,6 @@ qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
  
          spin_lock(&qunit_hash_lock);
          qunit = dqacq_in_flight(qctxt, &qdata);
-        if (qunit)
-                /* grab reference on this qunit to handle races with
-                 * dqacq_completion(). Otherwise, this qunit could be freed just
-                 * after we release the qunit_hash_lock */
-                qunit_get(qunit);
          spin_unlock(&qunit_hash_lock);
  
          if (qunit) {
@@ -922,15 +1044,38 @@ qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
                  l_wait_event(qunit->lq_waitq, got_qunit(qunit), &lwi);
                  CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. (rc:%d)\n",
                         qunit, qunit->lq_rc);
+                /* keep same as schedule_dqacq() b=17030 */
+                spin_lock(&qunit->lq_lock);
+                if (qunit->lq_rc == 0)
+                        rc = -EAGAIN;
+                else
+                        rc = qunit->lq_rc;
+                spin_unlock(&qunit->lq_lock);
+                /* this is for dqacq_in_flight() */
                  qunit_put(qunit);
+                do_gettimeofday(&work_end);
+                timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+                lprocfs_counter_add(qctxt->lqc_stats,
+                                    isblk ? LQUOTA_WAIT_PENDING_BLK_QUOTA :
+                                            LQUOTA_WAIT_PENDING_INO_QUOTA,
+                                    timediff);
+        } else {
+                do_gettimeofday(&work_end);
+                timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+                lprocfs_counter_add(qctxt->lqc_stats,
+                                    isblk ? LQUOTA_NOWAIT_PENDING_BLK_QUOTA :
+                                            LQUOTA_NOWAIT_PENDING_INO_QUOTA,
+                                    timediff);
          }
-        RETURN(0);
+
+        RETURN(rc);
  }
  
  int
-qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb,
-           dqacq_handler_t handler)
+qctxt_init(struct obd_device *obd, dqacq_handler_t handler)
  {
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct super_block *sb = obd->u.obt.obt_sb;
          int rc = 0;
          ENTRY;
  
@@ -940,6 +1085,7 @@ qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb,
          if (rc)
                  RETURN(rc);
  
+        cfs_waitq_init(&qctxt->lqc_wait_for_qmaster);
          spin_lock_init(&qctxt->lqc_lock);
          spin_lock(&qctxt->lqc_lock);
          qctxt->lqc_handler = handler;
@@ -947,25 +1093,32 @@ qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb,
          qctxt->lqc_import = NULL;
          qctxt->lqc_recovery = 0;
          qctxt->lqc_switch_qs = 1; /* Change qunit size in default setting */
+        qctxt->lqc_valid = 1;
          qctxt->lqc_cqs_boundary_factor = 4;
          qctxt->lqc_cqs_least_bunit = PTLRPC_MAX_BRW_SIZE;
          qctxt->lqc_cqs_least_iunit = 2;
          qctxt->lqc_cqs_qs_factor = 2;
          qctxt->lqc_flags = 0;
+        QUOTA_MASTER_UNREADY(qctxt);
          qctxt->lqc_bunit_sz = default_bunit_sz;
          qctxt->lqc_btune_sz = default_bunit_sz / 100 * default_btune_ratio;
          qctxt->lqc_iunit_sz = default_iunit_sz;
          qctxt->lqc_itune_sz = default_iunit_sz * default_itune_ratio / 100;
          qctxt->lqc_switch_seconds = 300; /* enlarging will wait 5 minutes
                                            * after the last shrinking */
-        rc = lustre_hash_init(&LQC_HASH_BODY(qctxt), "LQS_HASH",128,
-                              &lqs_hash_operations);
-        if (rc) {
-                CDEBUG(D_ERROR, "initialize hash lqs on ost error!\n");
-                lustre_hash_exit(&LQC_HASH_BODY(qctxt));
-        }
+        qctxt->lqc_sync_blk = 0;
          spin_unlock(&qctxt->lqc_lock);
  
+        qctxt->lqc_lqs_hash = lustre_hash_init("LQS_HASH", 7, 7,
+                                               &lqs_hash_ops, 0);
+        if (!qctxt->lqc_lqs_hash)
+                CERROR("initialize hash lqs for %s error!\n", obd->obd_name);
+
+#ifdef LPROCFS
+        if (lquota_proc_setup(obd, is_master(obd, qctxt, 0, 0)))
+                CERROR("initialize proc for %s error!\n", obd->obd_name);
+#endif
+
          RETURN(rc);
  }
  
@@ -978,6 +1131,10 @@ void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
  
          INIT_LIST_HEAD(&tmp_list);
  
+        spin_lock(&qctxt->lqc_lock);
+        qctxt->lqc_valid = 0;
+        spin_unlock(&qctxt->lqc_lock);
+
          spin_lock(&qunit_hash_lock);
          for (i = 0; i < NR_DQHASH; i++) {
                  list_for_each_entry_safe(qunit, tmp, &qunit_hash[i], lq_hash) {
@@ -999,9 +1156,23 @@ void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
                  qunit_put(qunit);
          }
  
-        lustre_hash_exit(&LQC_HASH_BODY(qctxt));
+        lustre_hash_exit(qctxt->lqc_lqs_hash);
+
+        /* after qctxt_cleanup, qctxt might be freed, then check_qm() is
+         * unpredicted. So we must wait until lqc_wait_for_qmaster is empty */
+        while (cfs_waitq_active(&qctxt->lqc_wait_for_qmaster)) {
+                cfs_waitq_signal(&qctxt->lqc_wait_for_qmaster);
+                cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE,
+                                     cfs_time_seconds(1));
+        }
+
          ptlrpcd_decref();
  
+#ifdef LPROCFS
+        if (lquota_proc_cleanup(qctxt))
+                CERROR("cleanup proc error!\n");
+#endif
+
          EXIT;
  }
  
@@ -1070,7 +1241,8 @@ static int qslave_recovery_main(void *arg)
                          if (ret > 0) {
                                  int opc;
                                  opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
-                                rc = schedule_dqacq(obd, qctxt, &qdata, opc, 0);
+                                rc = schedule_dqacq(obd, qctxt, &qdata, opc,
+                                                    0, NULL);
                                  if (rc == -EDQUOT)
                                          rc = 0;
                          } else {
@@ -1114,3 +1286,99 @@ exit:
          EXIT;
  }
  
+
+/*
+ * lqs<->qctxt hash operations
+ */
+
+/* string hashing using djb2 hash algorithm */
+static unsigned
+lqs_hash(lustre_hash_t *lh, void *key, unsigned mask)
+{
+        struct quota_adjust_qunit *lqs_key;
+        unsigned hash;
+        ENTRY;
+
+        LASSERT(key);
+        lqs_key = (struct quota_adjust_qunit *)key;
+        hash = (QAQ_IS_GRP(lqs_key) ? 5381 : 5387) * lqs_key->qaq_id;
+
+        RETURN(hash & mask);
+}
+
+static int
+lqs_compare(void *key, struct hlist_node *hnode)
+{
+        struct quota_adjust_qunit *lqs_key;
+        struct lustre_qunit_size *q;
+        int rc;
+        ENTRY;
+
+        LASSERT(key);
+        lqs_key = (struct quota_adjust_qunit *)key;
+        q = hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
+
+        spin_lock(&q->lqs_lock);
+        rc = ((lqs_key->qaq_id == q->lqs_id) &&
+              (QAQ_IS_GRP(lqs_key) == LQS_IS_GRP(q)));
+        spin_unlock(&q->lqs_lock);
+
+        RETURN(rc);
+}
+
+static void *
+lqs_get(struct hlist_node *hnode)
+{
+        struct lustre_qunit_size *q = 
+            hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
+        ENTRY;
+
+        atomic_inc(&q->lqs_refcount);
+        CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
+               q, atomic_read(&q->lqs_refcount));
+
+        RETURN(q);
+}
+
+static void *
+lqs_put(struct hlist_node *hnode)
+{
+        struct lustre_qunit_size *q = 
+            hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
+        ENTRY;
+
+        LASSERT(atomic_read(&q->lqs_refcount) > 0);
+        atomic_dec(&q->lqs_refcount);
+        CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
+               q, atomic_read(&q->lqs_refcount));
+
+        RETURN(q);
+}
+
+static void
+lqs_exit(struct hlist_node *hnode)
+{
+        struct lustre_qunit_size *q;
+        ENTRY;
+
+        q = hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
+        /* 
+         * Nothing should be left. User of lqs put it and
+         * lqs also was deleted from table by this time
+         * so we should have 0 refs.
+         */
+        LASSERTF(atomic_read(&q->lqs_refcount) == 0, 
+                 "Busy lqs %p with %d refs\n", q,
+                 atomic_read(&q->lqs_refcount));
+        OBD_FREE_PTR(q);
+        EXIT;
+}
+
+static lustre_hash_ops_t lqs_hash_ops = {
+        .lh_hash    = lqs_hash,
+        .lh_compare = lqs_compare,
+        .lh_get     = lqs_get,
+        .lh_put     = lqs_put,
+        .lh_exit    = lqs_exit
+};
+#endif /* HAVE_QUOTA_SUPPORT */
diff --git a/lustre/quota/quota_ctl.c b/lustre/quota/quota_ctl.c

index e11473f..ebdb535 100644 (file)
--- a/lustre/quota/quota_ctl.c
+++ b/lustre/quota/quota_ctl.c
@@ -1,19 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/quota/quota_ctl.c
+ * GPL HEADER START
   *
- *  Copyright (c) 2005 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
-#define DEBUG_SUBSYSTEM S_MDS
+#define DEBUG_SUBSYSTEM S_LQUOTA
  
  #ifdef __KERNEL__
  # include <linux/version.h>
@@ -21,16 +44,11 @@
  # include <linux/init.h>
  # include <linux/fs.h>
  # include <linux/jbd.h>
-# include <linux/ext3_fs.h>
  # include <linux/quota.h>
-# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #  include <linux/smp_lock.h>
  #  include <linux/buffer_head.h>
  #  include <linux/workqueue.h>
  #  include <linux/mount.h>
-# else
-#  include <linux/locks.h>
-# endif
  #else /* __KERNEL__ */
  # include <liblustre.h>
  #endif
@@ -44,13 +62,19 @@
  #include <lustre_quota.h>
  #include "quota_internal.h"
  
+#ifdef HAVE_QUOTA_SUPPORT
  #ifdef __KERNEL__
  int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
  {
          struct obd_device *obd = exp->exp_obd;
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct timeval work_start;
+        struct timeval work_end;
+        long timediff;
          int rc = 0;
          ENTRY;
  
+        do_gettimeofday(&work_start);
          switch (oqctl->qc_cmd) {
          case Q_QUOTAON:
                  rc = mds_quota_on(obd, oqctl);
@@ -90,6 +114,9 @@ int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
                  CDEBUG(D_INFO, "mds_quotactl admin quota command %d, id %u, "
                                 "type %d, failed: rc = %d\n",
                         oqctl->qc_cmd, oqctl->qc_id, oqctl->qc_type, rc);
+        do_gettimeofday(&work_end);
+        timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+        lprocfs_counter_add(qctxt->lqc_stats, LQUOTA_QUOTA_CTL, timediff);
  
          RETURN(rc);
  }
@@ -99,9 +126,14 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
          struct obd_device *obd = exp->exp_obd;
          struct obd_device_target *obt = &obd->u.obt;
          struct lvfs_run_ctxt saved;
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct timeval work_start;
+        struct timeval work_end;
+        long timediff;
          int rc = 0;
          ENTRY;
  
+        do_gettimeofday(&work_start);
          switch (oqctl->qc_cmd) {
          case Q_FINVALIDATE:
          case Q_QUOTAON:
@@ -197,8 +229,8 @@ adjust:
                  else
                          gid = oqctl->qc_id;
  
-                rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, 
-                                        uid, gid, 1, 0);
+                rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt,
+                                        uid, gid, 1, 0, NULL);
                  if (rc == -EDQUOT || rc == -EBUSY) {
                          CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                          rc = 0;
@@ -211,16 +243,20 @@ adjust:
                         obd->obd_name, oqctl->qc_cmd);
                  RETURN(-EFAULT);
          }
+        do_gettimeofday(&work_end);
+        timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+        lprocfs_counter_add(qctxt->lqc_stats, LQUOTA_QUOTA_CTL, timediff);
  
          RETURN(rc);
  }
  #endif /* __KERNEL__ */
+#endif
  
  int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
  {
          struct ptlrpc_request *req;
          struct obd_quotactl *oqc;
-        int size[2] = { sizeof(struct ptlrpc_body), sizeof(*oqctl) };
+        __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*oqctl) };
          int ver, opc, rc;
          ENTRY;
  
@@ -244,8 +280,15 @@ int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
          ptlrpc_req_set_repsize(req, 2, size);
  
          rc = ptlrpc_queue_wait(req);
-        oqc = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*oqc),
-                                 lustre_swab_obd_quotactl);
+        if (rc) {
+                CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+                GOTO(out, rc);
+        }
+
+        oqc = NULL;
+        if (req->rq_repmsg)
+                oqc = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*oqc),
+                                         lustre_swab_obd_quotactl);
          if (oqc == NULL) {
                  CERROR ("Can't unpack obd_quotactl\n");
                  GOTO(out, rc = -EPROTO);
@@ -267,9 +310,12 @@ int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
          int i, rc = 0;
          ENTRY;
  
-        if (oqctl->qc_cmd != Q_QUOTAON && oqctl->qc_cmd != Q_QUOTAOFF &&
-            oqctl->qc_cmd != Q_GETOQUOTA && oqctl->qc_cmd != Q_INITQUOTA &&
-            oqctl->qc_cmd != Q_SETQUOTA && oqctl->qc_cmd != Q_FINVALIDATE) {
+        if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON &&
+            oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF &&
+            oqctl->qc_cmd != Q_GETOQUOTA &&
+            oqctl->qc_cmd != Q_INITQUOTA &&
+            oqctl->qc_cmd != LUSTRE_Q_SETQUOTA &&
+            oqctl->qc_cmd != Q_FINVALIDATE) {
                  CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd);
                  RETURN(-EFAULT);
          }
@@ -306,4 +352,3 @@ int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
          }
          RETURN(rc);
  }
-
diff --git a/lustre/quota/quota_interface.c b/lustre/quota/quota_interface.c

index 6f7e695..7aa638c 100644 (file)
--- a/lustre/quota/quota_interface.c
+++ b/lustre/quota/quota_interface.c
@@ -1,19 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * lustre/quota/quota_interface.c
+ * GPL HEADER START
   *
- * Copyright (c) 2001-2005 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- * This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- * No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
-#define DEBUG_SUBSYSTEM S_MDS
+#define DEBUG_SUBSYSTEM S_LQUOTA
  
  #ifdef __KERNEL__
  # include <linux/version.h>
@@ -21,15 +45,10 @@
  # include <linux/init.h>
  # include <linux/fs.h>
  # include <linux/jbd.h>
-# include <linux/ext3_fs.h>
-# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
  #  include <linux/smp_lock.h>
  #  include <linux/buffer_head.h>
  #  include <linux/workqueue.h>
  #  include <linux/mount.h>
-# else
-#  include <linux/locks.h>
-# endif
  #else /* __KERNEL__ */
  # include <liblustre.h>
  #endif
@@ -46,258 +65,10 @@
  
  #ifdef __KERNEL__
  
-/* quota proc file handling functions */
-#ifdef LPROCFS
-
-#define USER_QUOTA      1
-#define GROUP_QUOTA     2
-
-#define MAX_STYPE_SIZE  5
-
-/* The following information about CURRENT quotas is expected on the output:
- * MDS: u for user quotas (administrative+operational) turned on,
- *      g for group quotas (administrative+operational) turned on,
- *      1 for 32-bit operational quotas and 32-bit administrative quotas,
- *      2 for 32-bit operational quotas and 64-bit administrative quotas,
- *      3 for 64-bit operational quotas and 64-bit administrative quotas
- * OST: u for user quotas (operational) turned on,
- *      g for group quotas (operational) turned on,
- *      1 for 32-bit local operational quotas,
- *      3 for 64-bit local operational quotas,
- * Permanent parameters can be read with lctl (?)
- */
-int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count,
-                          int *eof, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        char stype[MAX_STYPE_SIZE + 1] = "";
-        int oq_type, rc, is_mds;
-        lustre_quota_version_t aq_version, oq_version;
-        struct obd_device_target *obt;
-
-        LASSERT(obd != NULL);
-
-        obt = &obd->u.obt;
-        is_mds = !strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME);
-
-        /* Collect the needed information */
-        oq_type = obd->u.obt.obt_qctxt.lqc_flags;
-        oq_version = obt->obt_qfmt;
-        if (is_mds) {
-                rc = mds_quota_get_version(obd, &aq_version);
-                if (rc)
-                        return -EPROTO;
-                /* Here we can also assert that aq_type == oq_type
-                 * except for quota startup/shutdown states     */
-        }
-
-        /* Transform the collected data into a user-readable string */
-        if (oq_type & LQC_USRQUOTA_FLAG)
-                strcat(stype, "u");
-        if (oq_type & LQC_GRPQUOTA_FLAG)
-                strcat(stype, "g");
-
-        if ((!is_mds || aq_version == LUSTRE_QUOTA_V1) &&
-            oq_version == LUSTRE_QUOTA_V1)
-                strcat(stype, "1");
-#ifdef HAVE_QUOTA64
-        else if ((!is_mds || aq_version == LUSTRE_QUOTA_V2) &&
-                 oq_version == LUSTRE_QUOTA_V2)
-                strcat(stype, "3");
-#endif
-        else if (is_mds && aq_version == LUSTRE_QUOTA_V2 &&
-                 oq_version == LUSTRE_QUOTA_V1)
-                strcat(stype, "2");
-        else
-                return -EPROTO;
-
-        return snprintf(page, count, "%s\n", stype);
-}
-EXPORT_SYMBOL(lprocfs_quota_rd_type);
-
-static int auto_quota_on(struct obd_device *obd, int type,
-                         struct super_block *sb, int is_master)
-{
-        struct obd_quotactl *oqctl;
-        struct lvfs_run_ctxt saved;
-        int rc = 0, id;
-        struct obd_device_target *obt;
-        ENTRY;
-
-        LASSERT(type == USRQUOTA || type == GRPQUOTA || type == UGQUOTA);
-
-        obt = &obd->u.obt;
-
-        OBD_ALLOC_PTR(oqctl);
-        if (!oqctl)
-                RETURN(-ENOMEM);
-
-        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
-                CDEBUG(D_INFO, "other people are doing quotacheck\n");
-                atomic_inc(&obt->obt_quotachecking);
-                RETURN(-EBUSY);
-        }
-
-        id = UGQUOTA2LQC(type);
-        /* quota already turned on */
-        if ((obt->obt_qctxt.lqc_flags & id) == id) {
-                rc = 0;
-                goto out;
-        }
-
-        oqctl->qc_type = type;
-        oqctl->qc_cmd = Q_QUOTAON;
-        oqctl->qc_id = obt->obt_qfmt;
-
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        if (is_master) {
-                struct mds_obd *mds = &obd->u.mds;
-
-                down(&mds->mds_qonoff_sem);
-                /* turn on cluster wide quota */
-                rc = mds_admin_quota_on(obd, oqctl);
-                if (rc)
-                        CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR,
-                               "auto-enable admin quota failed. rc=%d\n", rc);
-                up(&mds->mds_qonoff_sem);
-
-        }
-        if (!rc) {
-                /* turn on local quota */
-                rc = fsfilt_quotactl(obd, sb, oqctl);
-                if (rc)
-                        CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR,
-                               "auto-enable local quota failed. rc=%d\n", rc);
-                else
-                        obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(type);
-        }
-
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
-out:
-        atomic_inc(&obt->obt_quotachecking);
-
-        OBD_FREE_PTR(oqctl);
-        RETURN(rc);
-}
-
-static int filter_quota_set_version(struct obd_device *obd, 
-                                    lustre_quota_version_t version)
-{
-        struct obd_device_target *obt = &obd->u.obt;
-
-        if (version != LUSTRE_QUOTA_V1) {
-#ifdef HAVE_QUOTA64
-                if (version != LUSTRE_QUOTA_V2)
-#endif
-                        return -EINVAL;
-        }
-
-        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
-                CDEBUG(D_INFO, "other people are doing quotacheck\n");
-                atomic_inc(&obt->obt_quotachecking);
-                return -EBUSY;
-        }
-
-        if (obt->obt_qctxt.lqc_flags & (LQC_USRQUOTA_FLAG | LQC_GRPQUOTA_FLAG)) {
-                atomic_inc(&obt->obt_quotachecking);
-                return -EBUSY;
-        }
-
-        obt->obt_qfmt = version;
-
-        atomic_inc(&obt->obt_quotachecking);
-
-        return 0;
-}
+#ifdef HAVE_QUOTA_SUPPORT
  
-/* The following settings of CURRENT quotas is expected on the input:
- * MDS: u for user quotas (administrative+operational) turned on,
- *      g for group quotas (administrative+operational) turned on,
- *      1 for 32-bit operational quotas and 32-bit administrative quotas,
- *      2 for 32-bit operational quotas and 64-bit administrative quotas,
- *      3 for 64-bit operational quotas and 64-bit administrative quotas
- * OST: u for user quotas (operational) turned on,
- *      g for group quotas (operational) turned on,
- *      1 for 32-bit local operational quotas,
- *      2 for 32-bit local operational quotas,
- *      3 for 64-bit local operational quotas,
- * Permanent parameters can be set with lctl/tunefs
- */
-int lprocfs_quota_wr_type(struct file *file, const char *buffer,
-                          unsigned long count, void *data)
-{
-        struct obd_device *obd = (struct obd_device *)data;
-        struct obd_device_target *obt;
-        int type = 0, is_mds, idx;
-        unsigned long i;
-        char stype[MAX_STYPE_SIZE + 1] = "";
-        static const lustre_quota_version_t s2av[3] = {LUSTRE_QUOTA_V1,
-                                                       LUSTRE_QUOTA_V2,
-                                                       LUSTRE_QUOTA_V2},
-                                            s2ov[3] = {LUSTRE_QUOTA_V1,
-                                                       LUSTRE_QUOTA_V1,
-                                                       LUSTRE_QUOTA_V2};
-        LASSERT(obd != NULL);
-
-        obt = &obd->u.obt;
-
-        is_mds = !strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME);
-
-        if (count > MAX_STYPE_SIZE)
-                return -EINVAL;
-
-        if (copy_from_user(stype, buffer, count))
-                return -EFAULT;
-
-        for (i = 0 ; i < count ; i++) {
-                int rc;
-
-                switch (stype[i]) {
-                case 'u' :
-                        type |= USER_QUOTA;
-                        break;
-                case 'g' :
-                        type |= GROUP_QUOTA;
-                        break;
-                /* quota version specifiers */
-                case '1' :
-                case '2' :
-                case '3' :
-                        idx = stype[i] - '1';
-#ifndef HAVE_QUOTA64
-                        if (s2ov[idx] == LUSTRE_QUOTA_V2)
-                                return -EINVAL;
-#endif
-                        if (is_mds) {
-                                rc = mds_quota_set_version(obd, s2av[idx]);
-                                if (rc) {
-                                        CDEBUG(D_QUOTA, "failed to set admin "
-                                               "quota to spec %c! %d\n",
-                                               stype[i], rc);
-                                        return rc;
-                                }
-                        }
-                        rc = filter_quota_set_version(obd, s2ov[idx]);
-                        if (rc) {
-                                CDEBUG(D_QUOTA, "failed to set operational quota"
-                                       " to spec %c! %d\n", stype[i], rc);
-                                return rc;
-                        }
-                        break;
-                default  : /* just skip stray symbols like \n */
-                        break;
-                }
-        }
-
-        if (type != 0)
-                auto_quota_on(obd, type - 1, obt->obt_sb, is_mds);
-
-        return count;
-}
-EXPORT_SYMBOL(lprocfs_quota_wr_type);
-
-#endif /* LPROCFS */
+static cfs_time_t last_print = 0;
+static spinlock_t last_print_lock = SPIN_LOCK_UNLOCKED;
  
  static int filter_quota_setup(struct obd_device *obd)
  {
@@ -311,7 +82,7 @@ static int filter_quota_setup(struct obd_device *obd)
          obt->obt_qfmt = LUSTRE_QUOTA_V1;
  #endif
          atomic_set(&obt->obt_quotachecking, 1);
-        rc = qctxt_init(&obt->obt_qctxt, obt->obt_sb, NULL);
+        rc = qctxt_init(obd, NULL);
          if (rc)
                  CERROR("initialize quota context failed! (rc:%d)\n", rc);
  
@@ -327,11 +98,15 @@ static int filter_quota_cleanup(struct obd_device *obd)
  static int filter_quota_setinfo(struct obd_export *exp, struct obd_device *obd)
  {
          struct obd_import *imp;
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        ENTRY;
  
          /* setup the quota context import */
          spin_lock(&obd->u.obt.obt_qctxt.lqc_lock);
          obd->u.obt.obt_qctxt.lqc_import = exp->exp_imp_reverse;
          spin_unlock(&obd->u.obt.obt_qctxt.lqc_lock);
+        CDEBUG(D_QUOTA, "%s: lqc_import(%p) of obd(%p) is reactivated now, \n",
+               obd->obd_name,exp->exp_imp_reverse, obd);
  
          /* make imp's connect flags equal relative exp's connect flags
           * adding it to avoid the scan export list
@@ -342,14 +117,16 @@ static int filter_quota_setinfo(struct obd_export *exp, struct obd_device *obd)
                          (exp->exp_connect_flags &
                           (OBD_CONNECT_QUOTA64 | OBD_CONNECT_CHANGE_QS));
  
+        cfs_waitq_signal(&qctxt->lqc_wait_for_qmaster);
          /* start quota slave recovery thread. (release high limits) */
          qslave_start_recovery(obd, &obd->u.obt.obt_qctxt);
-        return 0;
+        RETURN(0);
  }
  
  static int filter_quota_clearinfo(struct obd_export *exp, struct obd_device *obd)
  {
          struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        ENTRY;
  
          /* lquota may be not set up before destroying export, b=14896 */
          if (!obd->obd_set_up)
@@ -360,10 +137,12 @@ static int filter_quota_clearinfo(struct obd_export *exp, struct obd_device *obd
          if (qctxt->lqc_import == exp->exp_imp_reverse) {
                  spin_lock(&qctxt->lqc_lock);
                  qctxt->lqc_import = NULL;
+                CDEBUG(D_QUOTA, "%s: lqc_import of obd(%p) is invalid now.\n",
+                       obd->obd_name, obd);
                  spin_unlock(&qctxt->lqc_lock);
          }
  
-        return 0;
+        RETURN(0);
  }
  
  static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore)
@@ -373,10 +152,12 @@ static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore)
          if (!sb_any_quota_enabled(obd->u.obt.obt_sb))
                  RETURN(0);
  
-        if (ignore)
-                cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
-        else
-                cap_lower(current->cap_effective, CAP_SYS_RESOURCE);
+        if (ignore) {
+                CDEBUG(D_QUOTA, "blocks will be written with ignoring quota.\n");
+                cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+        } else {
+                cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+        }
  
          RETURN(0);
  }
@@ -384,6 +165,7 @@ static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore)
  static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
  {
          struct obd_device_target *obt = &obd->u.obt;
+        struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt;
          int err, cnt, rc = 0;
          struct obd_quotactl *oqctl;
          ENTRY;
@@ -391,15 +173,42 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
          if (!sb_any_quota_enabled(obt->obt_sb))
                  RETURN(0);
  
-        oa->o_flags &= ~(OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA);
-
          OBD_ALLOC_PTR(oqctl);
          if (!oqctl) {
                  CERROR("Not enough memory!");
                  RETURN(-ENOMEM);
          }
  
+        /* set over quota flags for a uid/gid */
+        oa->o_valid |= OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA;
+        oa->o_flags &= ~(OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA);
+
          for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+                struct quota_adjust_qunit oqaq_tmp;
+                struct lustre_qunit_size *lqs = NULL;
+
+                oqaq_tmp.qaq_flags = cnt;
+                oqaq_tmp.qaq_id = (cnt == USRQUOTA) ? oa->o_uid : oa->o_gid;
+
+                quota_search_lqs(NULL, &oqaq_tmp, qctxt, &lqs);
+                if (lqs) {
+                        spin_lock(&lqs->lqs_lock);
+                        if (lqs->lqs_bunit_sz <= qctxt->lqc_sync_blk) {
+                                oa->o_flags |= (cnt == USRQUOTA) ?
+                                        OBD_FL_NO_USRQUOTA : OBD_FL_NO_GRPQUOTA;
+                                CDEBUG(D_QUOTA, "set sync flag: bunit(%lu), "
+                                       "sync_blk(%d)\n", lqs->lqs_bunit_sz,
+                                       qctxt->lqc_sync_blk);
+                                spin_unlock(&lqs->lqs_lock);
+                                /* this is for quota_search_lqs */
+                                lqs_putref(lqs);
+                                continue;
+                        }
+                        spin_unlock(&lqs->lqs_lock);
+                        /* this is for quota_search_lqs */
+                        lqs_putref(lqs);
+                }
+
                  memset(oqctl, 0, sizeof(*oqctl));
  
                  oqctl->qc_cmd = Q_GETQUOTA;
@@ -409,12 +218,11 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
                  if (err) {
                          if (!rc)
                                  rc = err;
+                        oa->o_valid &= ~((cnt == USRQUOTA) ? OBD_MD_FLUSRQUOTA :
+                                                             OBD_MD_FLGRPQUOTA);
                          continue;
                  }
  
-                /* set over quota flags for a uid/gid */
-                oa->o_valid |= (cnt == USRQUOTA) ?
-                               OBD_MD_FLUSRQUOTA : OBD_MD_FLGRPQUOTA;
                  if (oqctl->qc_dqblk.dqb_bhardlimit &&
                     (toqb(oqctl->qc_dqblk.dqb_curspace) >=
                      oqctl->qc_dqblk.dqb_bhardlimit))
@@ -426,25 +234,27 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
  }
  
  static int filter_quota_acquire(struct obd_device *obd, unsigned int uid,
-                                unsigned int gid)
+                                unsigned int gid, struct obd_trans_info *oti)
  {
          struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
          int rc;
          ENTRY;
  
-        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, LQUOTA_FLAGS_BLK, 1);
+        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, LQUOTA_FLAGS_BLK, 1, oti);
          RETURN(rc);
  }
  
  /* check whether the left quota of certain uid and gid can satisfy a block_write
   * or inode_create rpc. When need to acquire quota, return QUOTA_RET_ACQUOTA */
  static int quota_check_common(struct obd_device *obd, unsigned int uid,
-                              unsigned int gid, int count, int cycle, int isblk)
+                              unsigned int gid, int count, int cycle, int isblk,
+                              struct inode *inode, int frags, int *pending)
  {
          struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
          int i;
          __u32 id[MAXQUOTAS] = { uid, gid };
          struct qunit_data qdata[MAXQUOTAS];
+        int mb = 0;
          int rc = 0, rc2[2] = { 0, 0 };
          ENTRY;
  
@@ -452,6 +262,13 @@ static int quota_check_common(struct obd_device *obd, unsigned int uid,
          if (!sb_any_quota_enabled(qctxt->lqc_sb))
                  RETURN(rc);
  
+        spin_lock(&qctxt->lqc_lock);
+        if (!qctxt->lqc_valid){
+                spin_unlock(&qctxt->lqc_lock);
+                RETURN(rc);
+        }
+        spin_unlock(&qctxt->lqc_lock);
+
          for (i = 0; i < MAXQUOTAS; i++) {
                  struct lustre_qunit_size *lqs = NULL;
  
@@ -472,19 +289,32 @@ static int quota_check_common(struct obd_device *obd, unsigned int uid,
                  rc2[i] = compute_remquota(obd, qctxt, &qdata[i], isblk);
                  spin_lock(&lqs->lqs_lock);
                  if (!cycle) {
-                        rc = QUOTA_RET_INC_PENDING;
-                        if (isblk)
-                                lqs->lqs_bwrite_pending += count;
-                        else
-                                lqs->lqs_iwrite_pending += count;
+                        if (isblk) {
+                                *pending = count * CFS_PAGE_SIZE;
+                                /* in order to complete this write, we need extra
+                                 * meta blocks. This function can get it through
+                                 * data needed to be written b=16542 */
+                                mb = *pending;
+                                LASSERT(inode && frags > 0);
+                                if (fsfilt_get_mblk(obd, qctxt->lqc_sb, &mb,
+                                                    inode, frags) < 0)
+                                        CDEBUG(D_ERROR,
+                                               "can't get extra meta blocks.\n");
+                                else
+                                        *pending += mb;
+                                lqs->lqs_bwrite_pending += *pending;
+                        } else {
+                                *pending = count;
+                                lqs->lqs_iwrite_pending += *pending;
+                        }
                  }
  
-                CDEBUG(D_QUOTA, "write pending: %lu, qd_count: "LPU64".\n",
+                CDEBUG(D_QUOTA, "count: %d, lqs pending: %lu, qd_count: "LPU64
+                       ", metablocks: %d, isblk: %d, pending: %d.\n", count,
                         isblk ? lqs->lqs_bwrite_pending : lqs->lqs_iwrite_pending,
-                       qdata[i].qd_count);
+                       qdata[i].qd_count, mb, isblk, *pending);
                  if (rc2[i] == QUOTA_RET_OK) {
-                        if (isblk && qdata[i].qd_count <
-                            lqs->lqs_bwrite_pending * CFS_PAGE_SIZE)
+                        if (isblk && qdata[i].qd_count < lqs->lqs_bwrite_pending)
                                  rc2[i] = QUOTA_RET_ACQUOTA;
                          if (!isblk && qdata[i].qd_count <
                              lqs->lqs_iwrite_pending)
@@ -504,34 +334,57 @@ static int quota_check_common(struct obd_device *obd, unsigned int uid,
          }
  
          if (rc2[0] == QUOTA_RET_ACQUOTA || rc2[1] == QUOTA_RET_ACQUOTA)
-                RETURN(rc | QUOTA_RET_ACQUOTA);
+                RETURN(QUOTA_RET_ACQUOTA);
          else
                  RETURN(rc);
  }
  
  static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
                                  unsigned int gid, int count, int *pending,
-                                int isblk, quota_acquire acquire)
+                                int isblk, quota_acquire acquire,
+                                struct obd_trans_info *oti, struct inode *inode,
+                                int frags)
  {
-        int rc = 0, cycle = 0, count_err = 0;
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct timeval work_start;
+        struct timeval work_end;
+        long timediff;
+        struct l_wait_info lwi = { 0 };
+        int rc = 0, cycle = 0, count_err = 1;
          ENTRY;
  
+        CDEBUG(D_QUOTA, "check quota for %s\n", obd->obd_name);
          /* Unfortunately, if quota master is too busy to handle the
           * pre-dqacq in time and quota hash on ost is used up, we
           * have to wait for the completion of in flight dqacq/dqrel,
           * in order to get enough quota for write b=12588 */
-        while ((rc = quota_check_common(obd, uid, gid, count, cycle, isblk)) &
-               QUOTA_RET_ACQUOTA) {
+        do_gettimeofday(&work_start);
+        while ((rc = quota_check_common(obd, uid, gid, count, cycle, isblk,
+                                        inode, frags, pending)) & QUOTA_RET_ACQUOTA) {
  
-                if (rc & QUOTA_RET_INC_PENDING)
-                        *pending = 1;
+                spin_lock(&qctxt->lqc_lock);
+                if (!qctxt->lqc_import && oti) {
+                        spin_unlock(&qctxt->lqc_lock);
+
+                        LASSERT(oti && oti->oti_thread &&
+                                oti->oti_thread->t_watchdog);
+
+                        lc_watchdog_disable(oti->oti_thread->t_watchdog);
+                        CDEBUG(D_QUOTA, "sleep for quota master\n");
+                        l_wait_event(qctxt->lqc_wait_for_qmaster, check_qm(qctxt),
+                                     &lwi);
+                        CDEBUG(D_QUOTA, "wake up when quota master is back\n");
+                        lc_watchdog_touch(oti->oti_thread->t_watchdog);
+                } else {
+                        spin_unlock(&qctxt->lqc_lock);
+                }
  
                  cycle++;
                  if (isblk)
                          OBD_FAIL_TIMEOUT(OBD_FAIL_OST_HOLD_WRITE_RPC, 90);
                  /* after acquire(), we should run quota_check_common again
                   * so that we confirm there are enough quota to finish write */
-                rc = acquire(obd, uid, gid);
+                rc = acquire(obd, uid, gid, oti);
  
                  /* please reference to dqacq_completion for the below */
                  /* a new request is finished, try again */
@@ -546,54 +399,79 @@ static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
                          break;
                  }
  
-                /* -EBUSY and others, try 10 times */
-                if (rc < 0 && count_err < 10) {
-                        CDEBUG(D_QUOTA, "rc: %d, count_err: %d\n", rc, count_err++);
-                        cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, HZ);
-                        continue;
+                /* -EBUSY and others, wait a second and try again */
+                if (rc < 0) {
+                        cfs_waitq_t        waitq;
+                        struct l_wait_info lwi;
+
+                        if (oti && oti->oti_thread && oti->oti_thread->t_watchdog)
+                                lc_watchdog_touch(oti->oti_thread->t_watchdog);
+                        CDEBUG(D_QUOTA, "rc: %d, count_err: %d\n", rc,
+                               count_err++);
+
+                        init_waitqueue_head(&waitq);
+                        lwi = LWI_TIMEOUT(cfs_time_seconds(min(cycle, 10)), NULL,
+                                          NULL);
+                        l_wait_event(waitq, 0, &lwi);
                  }
  
-                if (count_err >= 10 || cycle >= 1000) {
-                        CDEBUG(D_ERROR, "we meet 10 errors or run too many"
-                               " cycles when acquiring quota, quit checking with"
-                               " rc: %d, cycle: %d.\n", rc, cycle);
-                        break;
+                if (rc < 0 || cycle % 10 == 2) {
+                        spin_lock(&last_print_lock);
+                        if (last_print == 0 ||
+                            cfs_time_before((last_print + cfs_time_seconds(30)),
+                                            cfs_time_current())) {
+                                CWARN("still haven't managed to acquire quota "
+                                      "space from the quota master after %d "
+                                      "retries (err=%d, rc=%d)\n",
+                                      cycle, count_err - 1, rc);
+                                last_print = cfs_time_current();
+                        }
+                        spin_unlock(&last_print_lock);
                  }
  
                  CDEBUG(D_QUOTA, "recheck quota with rc: %d, cycle: %d\n", rc,
                         cycle);
          }
  
-        if (!cycle && rc & QUOTA_RET_INC_PENDING)
-                *pending = 1;
+        do_gettimeofday(&work_end);
+        timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+        lprocfs_counter_add(qctxt->lqc_stats,
+                            isblk ? LQUOTA_WAIT_FOR_CHK_BLK :
+                                    LQUOTA_WAIT_FOR_CHK_INO,
+                            timediff);
  
          RETURN(rc);
  }
  
-
  static int filter_quota_check(struct obd_device *obd, unsigned int uid,
-                              unsigned int gid, int npage, int *flag,
-                              quota_acquire acquire)
+                              unsigned int gid, int npage, int *pending,
+                              quota_acquire acquire, struct obd_trans_info *oti,
+                              struct inode *inode, int frags)
  {
-        return quota_chk_acq_common(obd, uid, gid, npage, flag, LQUOTA_FLAGS_BLK,
-                                    acquire);
+        return quota_chk_acq_common(obd, uid, gid, npage, pending, LQUOTA_FLAGS_BLK,
+                                    acquire, oti, inode, frags);
  }
  
  /* when a block_write or inode_create rpc is finished, adjust the record for
   * pending blocks and inodes*/
  static int quota_pending_commit(struct obd_device *obd, unsigned int uid,
-                                unsigned int gid, int count, int isblk)
+                                unsigned int gid, int pending, int isblk)
  {
          struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct timeval work_start;
+        struct timeval work_end;
+        long timediff;
          int i;
          __u32 id[MAXQUOTAS] = { uid, gid };
          struct qunit_data qdata[MAXQUOTAS];
          ENTRY;
  
+        CDEBUG(D_QUOTA, "commit pending quota for  %s\n", obd->obd_name);
          CLASSERT(MAXQUOTAS < 4);
          if (!sb_any_quota_enabled(qctxt->lqc_sb))
                  RETURN(0);
  
+        do_gettimeofday(&work_start);
          for (i = 0; i < MAXQUOTAS; i++) {
                  struct lustre_qunit_size *lqs = NULL;
  
@@ -610,27 +488,27 @@ static int quota_pending_commit(struct obd_device *obd, unsigned int uid,
                  if (lqs) {
                          int flag = 0;
                          spin_lock(&lqs->lqs_lock);
-                        CDEBUG(D_QUOTA, "pending: %lu, count: %d.\n",
-                               isblk ? lqs->lqs_bwrite_pending :
-                               lqs->lqs_iwrite_pending, count);
-
                          if (isblk) {
-                                if (lqs->lqs_bwrite_pending >= count) {
-                                        lqs->lqs_bwrite_pending -= count;
+                                if (lqs->lqs_bwrite_pending >= pending) {
+                                        lqs->lqs_bwrite_pending -= pending;
                                          flag = 1;
                                  } else {
                                          CDEBUG(D_ERROR,
                                                 "there are too many blocks!\n");
                                  }
                          } else {
-                                if (lqs->lqs_iwrite_pending >= count) {
-                                        lqs->lqs_iwrite_pending -= count;
+                                if (lqs->lqs_iwrite_pending >= pending) {
+                                        lqs->lqs_iwrite_pending -= pending;
                                          flag = 1;
                                  } else {
                                          CDEBUG(D_ERROR,
                                                 "there are too many files!\n");
                                  }
                          }
+                        CDEBUG(D_QUOTA, "lqs pending: %lu, pending: %d, "
+                               "isblk: %d.\n",
+                               isblk ? lqs->lqs_bwrite_pending :
+                               lqs->lqs_iwrite_pending, pending, isblk);
  
                          spin_unlock(&lqs->lqs_lock);
                          lqs_putref(lqs);
@@ -640,14 +518,20 @@ static int quota_pending_commit(struct obd_device *obd, unsigned int uid,
                                  lqs_putref(lqs);
                  }
          }
+        do_gettimeofday(&work_end);
+        timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+        lprocfs_counter_add(qctxt->lqc_stats,
+                            isblk ? LQUOTA_WAIT_FOR_COMMIT_BLK :
+                                    LQUOTA_WAIT_FOR_COMMIT_INO,
+                            timediff);
  
          RETURN(0);
  }
  
  static int filter_quota_pending_commit(struct obd_device *obd, unsigned int uid,
-                                       unsigned int gid, int npage)
+                                       unsigned int gid, int blocks)
  {
-        return quota_pending_commit(obd, uid, gid, npage, LQUOTA_FLAGS_BLK);
+        return quota_pending_commit(obd, uid, gid, blocks, LQUOTA_FLAGS_BLK);
  }
  
  static int mds_quota_init(void)
@@ -677,7 +561,7 @@ static int mds_quota_setup(struct obd_device *obd)
          atomic_set(&obt->obt_quotachecking, 1);
          /* initialize quota master and quota context */
          sema_init(&mds->mds_qonoff_sem, 1);
-        rc = qctxt_init(&obt->obt_qctxt, obt->obt_sb, dqacq_handler);
+        rc = qctxt_init(obd, dqacq_handler);
          if (rc) {
                  CERROR("initialize quota context failed! (rc:%d)\n", rc);
                  RETURN(rc);
@@ -707,20 +591,22 @@ static int mds_quota_fs_cleanup(struct obd_device *obd)
  }
  
  static int mds_quota_check(struct obd_device *obd, unsigned int uid,
-                           unsigned int gid, int inodes, int *flag,
-                           quota_acquire acquire)
+                           unsigned int gid, int inodes, int *pending,
+                           quota_acquire acquire, struct obd_trans_info *oti,
+                           struct inode *inode, int frags)
  {
-        return quota_chk_acq_common(obd, uid, gid, inodes, flag, 0, acquire);
+        return quota_chk_acq_common(obd, uid, gid, inodes, pending, 0,
+                                    acquire, oti, inode, frags);
  }
  
  static int mds_quota_acquire(struct obd_device *obd, unsigned int uid,
-                             unsigned int gid)
+                             unsigned int gid, struct obd_trans_info *oti)
  {
          struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
          int rc;
          ENTRY;
  
-        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 0, 1);
+        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 0, 1, oti);
          RETURN(rc);
  }
  
@@ -729,6 +615,7 @@ static int mds_quota_pending_commit(struct obd_device *obd, unsigned int uid,
  {
          return quota_pending_commit(obd, uid, gid, inodes, 0);
  }
+#endif /* HAVE_QUOTA_SUPPORT */
  #endif /* __KERNEL__ */
  
  struct osc_quota_info {
@@ -940,6 +827,7 @@ int osc_quota_exit(void)
  }
  
  #ifdef __KERNEL__
+#ifdef HAVE_QUOTA_SUPPORT
  quota_interface_t mds_quota_interface = {
          .quota_init     = mds_quota_init,
          .quota_exit     = mds_quota_exit,
@@ -970,6 +858,7 @@ quota_interface_t filter_quota_interface = {
          .quota_adjust_qunit   = filter_quota_adjust_qunit,
          .quota_pending_commit = filter_quota_pending_commit,
  };
+#endif
  #endif /* __KERNEL__ */
  
  quota_interface_t mdc_quota_interface = {
@@ -997,13 +886,30 @@ quota_interface_t lov_quota_interface = {
  };
  
  #ifdef __KERNEL__
+
+cfs_proc_dir_entry_t *lquota_type_proc_dir = NULL;
+
  static int __init init_lustre_quota(void)
  {
-        int rc = qunit_cache_init();
+#ifdef HAVE_QUOTA_SUPPORT
+        int rc = 0;
+
+        lquota_type_proc_dir = lprocfs_register(OBD_LQUOTA_DEVICENAME,
+                                                proc_lustre_root,
+                                                NULL, NULL);
+        if (IS_ERR(lquota_type_proc_dir)) {
+                CERROR("LProcFS failed in lquota-init\n");
+                rc = PTR_ERR(lquota_type_proc_dir);
+                return rc;
+        }
+
+        rc = qunit_cache_init();
          if (rc)
                  return rc;
+
          PORTAL_SYMBOL_REGISTER(filter_quota_interface);
          PORTAL_SYMBOL_REGISTER(mds_quota_interface);
+#endif
          PORTAL_SYMBOL_REGISTER(mdc_quota_interface);
          PORTAL_SYMBOL_REGISTER(osc_quota_interface);
          PORTAL_SYMBOL_REGISTER(lov_quota_interface);
@@ -1012,23 +918,30 @@ static int __init init_lustre_quota(void)
  
  static void /*__exit*/ exit_lustre_quota(void)
  {
-        PORTAL_SYMBOL_UNREGISTER(filter_quota_interface);
-        PORTAL_SYMBOL_UNREGISTER(mds_quota_interface);
          PORTAL_SYMBOL_UNREGISTER(mdc_quota_interface);
          PORTAL_SYMBOL_UNREGISTER(osc_quota_interface);
          PORTAL_SYMBOL_UNREGISTER(lov_quota_interface);
+#ifdef HAVE_QUOTA_SUPPORT
+        PORTAL_SYMBOL_UNREGISTER(filter_quota_interface);
+        PORTAL_SYMBOL_UNREGISTER(mds_quota_interface);
  
          qunit_cache_cleanup();
+
+        if (lquota_type_proc_dir)
+                lprocfs_remove(&lquota_type_proc_dir);
+#endif
  }
  
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
  MODULE_DESCRIPTION("Lustre Quota");
  MODULE_LICENSE("GPL");
  
  cfs_module(lquota, "1.0.0", init_lustre_quota, exit_lustre_quota);
  
+#ifdef HAVE_QUOTA_SUPPORT
  EXPORT_SYMBOL(mds_quota_interface);
  EXPORT_SYMBOL(filter_quota_interface);
+#endif
  EXPORT_SYMBOL(mdc_quota_interface);
  EXPORT_SYMBOL(osc_quota_interface);
  EXPORT_SYMBOL(lov_quota_interface);
diff --git a/lustre/quota/quota_internal.h b/lustre/quota/quota_internal.h

index fb5fd93..c1c4baa 100644 (file)
--- a/lustre/quota/quota_internal.h
+++ b/lustre/quota/quota_internal.h
@@ -1,14 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/quota/quota_internal.h
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2005 Cluster File Systems, Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef __QUOTA_INTERNAL_H
@@ -16,12 +39,15 @@
  
  #include <lustre_quota.h>
  
+#ifdef HAVE_QUOTA_SUPPORT
+
  /* QUSG covnert bytes to blocks when counting block quota */
  #define QUSG(count, isblk)      (isblk ? toqb(count) : count)
  
  /* This flag is set in qc_stat to distinguish if the current getquota
   * operation is for quota recovery */
  #define QUOTA_RECOVERING    0x01
+#define OBD_LQUOTA_DEVICENAME  "lquota"
  
  #ifdef __KERNEL__
  
@@ -75,17 +101,18 @@
  void qunit_cache_cleanup(void);
  int qunit_cache_init(void);
  int qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
-                       uid_t uid, gid_t gid, __u32 isblk, int wait);
+                       uid_t uid, gid_t gid, __u32 isblk, int wait,
+                       struct obd_trans_info *oti);
  int qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
                               unsigned short type, int isblk);
-int qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb,
-               dqacq_handler_t handler);
+int qctxt_init(struct obd_device *obd, dqacq_handler_t handler);
  void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force);
  void qslave_start_recovery(struct obd_device *obd,
                             struct lustre_quota_ctxt *qctxt);
  int compute_remquota(struct obd_device *obd,
                       struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata,
                       int isblk);
+int check_qm(struct lustre_quota_ctxt *qctxt);
  /* quota_master.c */
  int lustre_dquot_init(void);
  void lustre_dquot_exit(void);
@@ -118,36 +145,10 @@ int dquot_create_oqaq(struct lustre_quota_ctxt *qctxt, struct lustre_dquot
  /* quota_ctl.c */
  int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
  int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
-int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
-int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
  
  /* quota_chk.c */
  int target_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
-int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
-int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
-int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
  
-#ifdef LPROCFS
-void lprocfs_quotactl_test_init_vars(struct lprocfs_static_vars *lvars);
-void lprocfs_quotacheck_test_init_vars(struct lprocfs_static_vars *lvars);
-#else
-static inline void lprocfs_quotactl_test_init_vars
-                                (struct lprocfs_static_vars *lvars)
-{
-        memset(lvars, 0, sizeof(*lvars));
-}
-static inline void lprocfs_quotacheck_test_init_vars
-                                (struct lprocfs_static_vars *lvars)
-{
-        memset(lvars, 0, sizeof(*lvars));
-}
-#endif
-
-/* quota_adjust_qunit.c */
-int client_quota_adjust_qunit(struct obd_export *exp, struct
-                              quota_adjust_qunit *oqaq);
-int lov_quota_adjust_qunit(struct obd_export *exp, struct
-                           quota_adjust_qunit *oqaq);
  int quota_adjust_slave_lqs(struct quota_adjust_qunit *oqaq, struct
                            lustre_quota_ctxt *qctxt);
  void qdata_to_oqaq(struct qunit_data *qdata,
@@ -169,8 +170,13 @@ extern int quote_get_qdata(struct ptlrpc_request *req, struct qunit_data *qdata,
                             int is_req, int is_exp);
  extern int quote_copy_qdata(struct ptlrpc_request *req, struct qunit_data *qdata,
                              int is_req, int is_exp);
-int filter_quota_adjust_qunit(struct obd_export *exp, struct
-                              quota_adjust_qunit *oqaq);
+int filter_quota_adjust_qunit(struct obd_export *exp,
+                              struct quota_adjust_qunit *oqaq,
+                              struct lustre_quota_ctxt *qctxt);
+int lquota_proc_setup(struct obd_device *obd, int is_master);
+int lquota_proc_cleanup(struct lustre_quota_ctxt *qctxt);
+
+extern cfs_proc_dir_entry_t *lquota_type_proc_dir;
  #endif
  
  #define LQS_BLK_DECREASE 1
@@ -180,3 +186,15 @@ int filter_quota_adjust_qunit(struct obd_export *exp, struct
  
  
  #endif
+int client_quota_adjust_qunit(struct obd_export *exp,
+                              struct quota_adjust_qunit *oqaq,
+                              struct lustre_quota_ctxt *qctxt);
+int lov_quota_adjust_qunit(struct obd_export *exp,
+                           struct quota_adjust_qunit *oqaq,
+                           struct lustre_quota_ctxt *qctxt);
+int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
+int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
+int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
+int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
+int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+#endif
diff --git a/lustre/quota/quota_master.c b/lustre/quota/quota_master.c

index 5187733..5c18b4f 100644 (file)
--- a/lustre/quota/quota_master.c
+++ b/lustre/quota/quota_master.c
@@ -1,22 +1,50 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  lustre/quota/quota_master.c
- *  Lustre Quota Master request handler
+ * GPL HEADER START
   *
- *  Copyright (c) 2001-2005 Cluster File Systems, Inc.
- *   Author: Niu YaWei <niu@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/quota/quota_master.c
+ *
+ * Lustre Quota Master request handler
+ *
+ * Author: Niu YaWei <niu@clusterfs.com>
   */
+
  #ifndef EXPORT_SYMTAB
  # define EXPORT_SYMTAB
  #endif
  
-#define DEBUG_SUBSYSTEM S_MDS
+#define DEBUG_SUBSYSTEM S_LQUOTA
  
  #include <linux/version.h>
  #include <linux/fs.h>
@@ -34,6 +62,8 @@
  
  #include "quota_internal.h"
  
+#ifdef HAVE_QUOTA_SUPPORT
+
  /* lock ordering: mds->mds_qonoff_sem > dquot->dq_sem */
  static struct list_head lustre_dquot_hash[NR_DQHASH];
  static spinlock_t dquot_hash_lock = SPIN_LOCK_UNLOCKED;
@@ -281,7 +311,7 @@ int dqacq_adjust_qunit_sz(struct obd_device *obd, qid_t id, int type,
  
          up(&dquot->dq_sem);
  
-        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, is_blk, 0);
+        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, is_blk, 0, NULL);
          if (rc == -EDQUOT || rc == -EBUSY) {
                  CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                  rc = 0;
@@ -294,7 +324,7 @@ int dqacq_adjust_qunit_sz(struct obd_device *obd, qid_t id, int type,
  
          /* only when block qunit is reduced, boardcast to osts */
          if ((adjust_res & LQS_BLK_DECREASE) && QAQ_IS_ADJBLK(oqaq))
-                rc = obd_quota_adjust_qunit(mds->mds_osc_exp, oqaq);
+                rc = obd_quota_adjust_qunit(mds->mds_osc_exp, oqaq, qctxt);
  
  out:
          lustre_dqput(dquot);
@@ -453,21 +483,26 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
          switch (opc) {
          case FSFILT_OP_RENAME:
                  /* acquire/release block quota on owner of original parent */
-                rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[2], qpids[3], 1, 0);
+                rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[2], qpids[3], 1, 0,
+                                         NULL);
                  /* fall-through */
          case FSFILT_OP_SETATTR:
                  /* acquire/release file quota on original owner */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0,
+                                          NULL);
                  /* fall-through */
          case FSFILT_OP_CREATE:
          case FSFILT_OP_UNLINK:
                  /* acquire/release file/block quota on owner of child
                   * (or current owner) */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0);
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0,
+                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
+                                          NULL);
                  /* acquire/release block quota on owner of parent
                   * (or original owner) */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
+                                          NULL);
                  break;
          default:
                  LBUG();
@@ -493,14 +528,17 @@ int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[],
          switch (opc) {
          case FSFILT_OP_SETATTR:
                  /* acquire/release block quota on original & current owner */
-                rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0);
-                rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0);
+                rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
+                                        NULL);
+                rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
+                                         NULL);
                  break;
          case FSFILT_OP_UNLINK:
                  /* release block quota on this owner */
          case FSFILT_OP_CREATE: /* XXX for write operation on obdfilter */
                  /* acquire block quota on this owner */
-                rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0);
+                rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
+                                        NULL);
                  break;
          default:
                  LBUG();
@@ -1117,7 +1155,8 @@ static int mds_init_slave_ilimits(struct obd_device *obd,
          else
                  gid = oqctl->qc_id;
  
-        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 0, 0);
+        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 0, 0,
+                                NULL);
          if (rc == -EDQUOT || rc == -EBUSY) {
                  CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                  rc = 0;
@@ -1185,7 +1224,8 @@ static int mds_init_slave_blimits(struct obd_device *obd,
          /* initialize all slave's limit */
          rc = obd_quotactl(mds->mds_osc_exp, ioqc);
  
-        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 1, 0);
+        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 1, 0,
+                                NULL);
          if (rc == -EDQUOT || rc == -EBUSY) {
                  CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                  rc = 0;
@@ -1199,7 +1239,7 @@ static int mds_init_slave_blimits(struct obd_device *obd,
           * this is will create a lqs for every ost, which will present
           * certain uid/gid is set quota or not */
          QAQ_SET_ADJBLK(oqaq);
-        rc = obd_quota_adjust_qunit(mds->mds_osc_exp, oqaq);
+        rc = obd_quota_adjust_qunit(mds->mds_osc_exp, oqaq, qctxt);
  
          EXIT;
  out:
@@ -1599,7 +1639,8 @@ int mds_quota_recovery(struct obd_device *obd)
  
          mutex_down(&lov->lov_lock);
          if (lov->desc.ld_tgt_count != lov->desc.ld_active_tgt_count) {
-                CWARN("Not all osts are active, abort quota recovery\n");
+                CWARN("Only %u/%u OSTs are active, abort quota recovery\n",
+                      lov->desc.ld_tgt_count, lov->desc.ld_active_tgt_count);
                  mutex_up(&lov->lov_lock);
                  RETURN(rc);
          }
@@ -1615,3 +1656,5 @@ int mds_quota_recovery(struct obd_device *obd)
          wait_for_completion(&data.comp);
          RETURN(rc);
  }
+
+#endif /* HAVE_QUOTA_SUPPORT */
diff --git a/lustre/quota/quotacheck_test.c b/lustre/quota/quotacheck_test.c

deleted file mode 100644 (file)

index a75f723..0000000
--- a/lustre/quota/quotacheck_test.c
+++ /dev/null
@@ -1,223 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2005 Cluster File Systems, Inc.
- *   Author: Lai Siyao <lsy@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org/
- *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
- *
- * A kernel module which tests the fsfilt quotacheck API from the OBD setup function.
- */
-
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-#define DEBUG_SUBSYSTEM S_CLASS
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/quotaops.h>
-#include <linux/ext3_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/version.h>
-#include <linux/bitops.h>
-
-#include <obd_class.h>
-#include <lustre_fsfilt.h>
-#include <lustre_mds.h>
-#include <obd_ost.h>
-
-char *test_quotafile[] = {"aquotacheck.user", "aquotacheck.group"};
-
-static inline struct ext3_group_desc *
-get_group_desc(struct super_block *sb, int group)
-{
-        unsigned long desc_block, desc;
-        struct ext3_group_desc *gdp;
-
-        desc_block = group / EXT3_DESC_PER_BLOCK(sb);
-        desc = group % EXT3_DESC_PER_BLOCK(sb);
-        gdp = (struct ext3_group_desc *)
-              EXT3_SB(sb)->s_group_desc[desc_block]->b_data;
-
-        return gdp + desc;
-}
-
-static inline struct buffer_head *
-read_inode_bitmap(struct super_block *sb, unsigned long group)
-{
-        struct ext3_group_desc *desc;
-        struct buffer_head *bh;
-
-        desc = get_group_desc(sb, group);
-        bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
-
-        return bh;
-}
-
-static inline struct inode *ext3_iget_inuse(struct super_block *sb,
-                                     struct buffer_head *bitmap_bh,
-                                     int index, unsigned long ino)
-{
-        struct inode *inode = NULL;
-
-        if (ext3_test_bit(index, bitmap_bh->b_data)) {
-                CERROR("i: %d, ino: %lu\n", index, ino);
-                ll_sleep(1);
-                inode = iget(sb, ino);
-        }
-
-        return inode;
-}
-
-static void print_inode(struct inode *inode)
-{
-        loff_t size = 0;
-
-        if (S_ISDIR(inode->i_mode) ||
-            S_ISREG(inode->i_mode) ||
-            S_ISLNK(inode->i_mode))
-                size = inode_get_bytes(inode);
-
-         CERROR("%lu: uid: %u, size: %llu, blocks: %llu, real size: %llu\n",
-               inode->i_ino, inode->i_uid, i_size_read(inode),
-               (long long)inode->i_blocks, size);
-}
-
-/* Test quotaon */
-static int quotacheck_test_1(struct obd_device *obd, struct super_block *sb)
-{
-        struct ext3_sb_info *sbi = EXT3_SB(sb);
-        struct buffer_head *bitmap_bh = NULL;
-        struct inode *inode;
-        unsigned long ino;
-        int i, group;
-        ENTRY;
-
-        for (group = 0; group < sbi->s_groups_count; group++) {
-                ino = group * sbi->s_inodes_per_group + 1;
-                brelse(bitmap_bh);
-                bitmap_bh = read_inode_bitmap(sb, group);
-
-                if (group == 0)
-                        CERROR("groups_count: %lu, inodes_per_group: %lu, first_ino: %u, inodes_count: %u\n",
-                               sbi->s_groups_count, sbi->s_inodes_per_group,
-                               sbi->s_first_ino, le32_to_cpu(sbi->s_es->s_inodes_count));
-
-                for (i = 0; i < sbi->s_inodes_per_group; i++, ino++) {
-                        if (ino < sbi->s_first_ino)
-                                continue;
-                        if (ino > le32_to_cpu(sbi->s_es->s_inodes_count)) {
-                                CERROR("bad inode number: %lu > s_inodes_count\n", ino);
-                                brelse(bitmap_bh);
-                                RETURN(-E2BIG);
-                        }
-                        inode = ext3_iget_inuse(sb, bitmap_bh, i, ino);
-                        if (inode)
-                                print_inode(inode);
-                        iput(inode);
-                }
-        }
-        brelse(bitmap_bh);
-
-        RETURN(0);
-}
-
-/* -------------------------------------------------------------------------
- * Tests above, boring obd functions below
- * ------------------------------------------------------------------------- */
-static int quotacheck_run_tests(struct obd_device *obd, struct obd_device *tgt)
-{
-        int rc;
-        ENTRY;
-
-        if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDS_NAME) &&
-            !strcmp(tgt->obd_type->typ_name, "obdfilter")) {
-                CERROR("TARGET OBD should be mds or ost\n");
-                RETURN(-EINVAL);
-        }
-
-        rc = quotacheck_test_1(tgt, tgt->u.obt.obt_sb);
-
-        return rc;
-}
-
-#ifdef LPROCFS
-static struct lprocfs_vars lprocfs_quotacheck_test_obd_vars[] = { {0} };
-static struct lprocfs_vars lprocfs_quotacheck_test_module_vars[] = { {0} };
-#endif
-
-void lprocfs_quotacheck_test_init_vars(struct lprocfs_static_vars *lvars)
-{
-    lvars->module_vars  = lprocfs_quotacheck_test_module_vars;
-    lvars->obd_vars     = lprocfs_quotacheck_test_obd_vars;
-}
-
-static int quotacheck_test_cleanup(struct obd_device *obd)
-{
-        lprocfs_obd_cleanup(obd);
-        return 0;
-}
-
-static int quotacheck_test_setup(struct obd_device *obd, obd_count len, void *buf)
-{
-        struct lprocfs_static_vars lvars = { 0 };
-        struct lustre_cfg *lcfg = buf;
-        struct obd_device *tgt;
-        int rc;
-        ENTRY;
-
-        if (lcfg->lcfg_bufcount < 1) {
-                CERROR("requires a mds OBD name\n");
-                RETURN(-EINVAL);
-        }
-
-        tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
-        if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
-                CERROR("target device not attached or not set up (%s)\n",
-                       lustre_cfg_string(lcfg, 1));
-                RETURN(-EINVAL);
-        }
-
-        rc = quotacheck_run_tests(obd, tgt);
-        if (rc)
-                quotacheck_test_cleanup(obd);
-
-        lprocfs_quotacheck_test_init_vars(&lvars);
-        lprocfs_obd_setup(obd, lvars.obd_vars);
-
-        RETURN(rc);
-}
-
-static struct obd_ops quotacheck_obd_ops = {
-        .o_owner       = THIS_MODULE,
-        .o_setup       = quotacheck_test_setup,
-        .o_cleanup     = quotacheck_test_cleanup,
-};
-
-static int __init quotacheck_test_init(void)
-{
-        struct lprocfs_static_vars lvars = { 0 };
-
-        lprocfs_quotacheck_test_init_vars(&lvars);
-        return class_register_type(&quotacheck_obd_ops, lvars.module_vars,
-                                   "quotacheck_test");
-}
-
-static void __exit quotacheck_test_exit(void)
-{
-        class_unregister_type("quotacheck_test");
-}
-
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("quotacheck test module");
-MODULE_LICENSE("GPL");
-
-module_init(quotacheck_test_init);
-module_exit(quotacheck_test_exit);
diff --git a/lustre/quota/quotactl_test.c b/lustre/quota/quotactl_test.c

deleted file mode 100644 (file)

index bf5b145..0000000
--- a/lustre/quota/quotactl_test.c
+++ /dev/null
@@ -1,363 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2005 Cluster File Systems, Inc.
- *   Author: Lai Siyao <lsy@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org/
- *
- *   No redistribution or use is permitted outside of Cluster File Systems, Inc.
- *
- * A kernel module which tests the fsfilt quotactl API from the OBD setup function.
- */
-
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-#define DEBUG_SUBSYSTEM S_CLASS
-
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <obd_class.h>
-#include <lustre_fsfilt.h>
-#include <lustre_mds.h>
-#include <obd_ost.h>
-
-static struct obd_quotactl oqctl;
-
-/* Test quotaon */
-static int quotactl_test_1(struct obd_device *obd, struct super_block *sb)
-{
-        int rc;
-        ENTRY;
-
-        oqctl.qc_cmd = Q_QUOTAON;
-        oqctl.qc_id = obd->u.obt.obt_qfmt;
-        oqctl.qc_type = UGQUOTA;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc)
-                CERROR("1a: quotactl Q_QUOTAON failed: %d\n", rc);
-        RETURN(rc);
-}
-
-#if 0 /* set/getinfo not supported, this is for cluster-wide quotas */
-/* Test set/getinfo */
-static int quotactl_test_2(struct obd_device *obd, struct super_block *sb)
-{
-        struct obd_quotactl oqctl;
-        int rc;
-        ENTRY;
-
-        oqctl.qc_cmd = Q_SETINFO;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_dqinfo.dqi_bgrace = 1616;
-        oqctl.qc_dqinfo.dqi_igrace = 2828;
-        oqctl.qc_dqinfo.dqi_flags = 0;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("2a: quotactl Q_SETINFO failed: %d\n", rc);
-                RETURN(rc);
-        }
-
-        oqctl.qc_cmd = Q_GETINFO;
-        oqctl.qc_type = USRQUOTA;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("2b: quotactl Q_GETINFO failed: %d\n", rc);
-                RETURN(rc);
-        }
-        if (oqctl.qc_dqinfo.dqi_bgrace != 1616 ||
-            oqctl.qc_dqinfo.dqi_igrace != 2828 ||
-            oqctl.qc_dqinfo.dqi_flags != 0) {
-                CERROR("2c: quotactl Q_GETINFO get wrong result: %d, %d, %d\n",
-                       oqctl.qc_dqinfo.dqi_bgrace,
-                       oqctl.qc_dqinfo.dqi_igrace,
-                       oqctl.qc_dqinfo.dqi_flags);
-                RETURN(-EINVAL);
-        }
-
-        RETURN(0);
-}
-#endif
-       
-/* Test set/getquota */
-static int quotactl_test_3(struct obd_device *obd, struct super_block *sb)
-{
-        int rc;
-        ENTRY;
-
-        oqctl.qc_cmd = Q_SETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_id = 500;
-        oqctl.qc_dqblk.dqb_bhardlimit = 919;
-        oqctl.qc_dqblk.dqb_bsoftlimit = 818;
-        oqctl.qc_dqblk.dqb_ihardlimit = 616;
-        oqctl.qc_dqblk.dqb_isoftlimit = 515;
-        oqctl.qc_dqblk.dqb_valid = QIF_LIMITS;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3a: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-
-        oqctl.qc_cmd = Q_GETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_id = 500;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3b: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-        if (oqctl.qc_dqblk.dqb_bhardlimit != 919 ||
-            oqctl.qc_dqblk.dqb_bsoftlimit != 818 ||
-            oqctl.qc_dqblk.dqb_ihardlimit != 616 ||
-            oqctl.qc_dqblk.dqb_isoftlimit != 515) {
-                CERROR("3c: quotactl Q_GETQUOTA get wrong result:"
-                       LPU64", "LPU64", "LPU64", "LPU64"\n",
-                       oqctl.qc_dqblk.dqb_bhardlimit,
-                       oqctl.qc_dqblk.dqb_bsoftlimit,
-                       oqctl.qc_dqblk.dqb_ihardlimit,
-                       oqctl.qc_dqblk.dqb_isoftlimit);
-                RETURN(-EINVAL);
-        }
-
-        oqctl.qc_cmd = Q_SETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_id = 500;
-        oqctl.qc_dqblk.dqb_curspace = 717;
-        oqctl.qc_dqblk.dqb_curinodes = 414;
-        oqctl.qc_dqblk.dqb_valid = QIF_USAGE;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3d: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-
-        oqctl.qc_cmd = Q_GETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_id = 500;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3e: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-        if (oqctl.qc_dqblk.dqb_curspace != 717 ||
-            oqctl.qc_dqblk.dqb_curinodes != 414) {
-                CERROR("3f: quotactl Q_GETQUOTA get wrong result: "
-                       LPU64", "LPU64"\n", oqctl.qc_dqblk.dqb_curspace,
-                       oqctl.qc_dqblk.dqb_curinodes);
-                RETURN(-EINVAL);
-        }
-
-        oqctl.qc_cmd = Q_SETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_dqblk.dqb_btime = 313;
-        oqctl.qc_dqblk.dqb_itime = 212;
-        oqctl.qc_id = 500;
-        oqctl.qc_dqblk.dqb_valid = QIF_TIMES;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3g: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-
-        oqctl.qc_cmd = Q_GETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_id = 500;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3h: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-        if (oqctl.qc_dqblk.dqb_btime != 313 ||
-            oqctl.qc_dqblk.dqb_itime != 212) {
-                CERROR("3i: quotactl Q_GETQUOTA get wrong result: "
-                       LPU64", "LPU64"\n", oqctl.qc_dqblk.dqb_btime,
-                       oqctl.qc_dqblk.dqb_itime);
-                RETURN(-EINVAL);
-        }
-
-        oqctl.qc_cmd = Q_SETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_id = 500;
-        oqctl.qc_dqblk.dqb_bhardlimit = 919;
-        oqctl.qc_dqblk.dqb_bsoftlimit = 818;
-        oqctl.qc_dqblk.dqb_curspace = 717;
-        oqctl.qc_dqblk.dqb_ihardlimit = 616;
-        oqctl.qc_dqblk.dqb_isoftlimit = 515;
-        oqctl.qc_dqblk.dqb_curinodes = 414;
-        oqctl.qc_dqblk.dqb_btime = 313;
-        oqctl.qc_dqblk.dqb_itime = 212;
-        oqctl.qc_dqblk.dqb_valid = QIF_ALL;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3j: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-
-        oqctl.qc_cmd = Q_GETQUOTA;
-        oqctl.qc_type = USRQUOTA;
-        oqctl.qc_id = 500;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("3k: quotactl Q_SETQUOTA failed: %d\n", rc);
-                RETURN(rc);
-        }
-        if (oqctl.qc_dqblk.dqb_bhardlimit != 919 ||
-            oqctl.qc_dqblk.dqb_bsoftlimit != 818 ||
-            oqctl.qc_dqblk.dqb_ihardlimit != 616 ||
-            oqctl.qc_dqblk.dqb_isoftlimit != 515 ||
-            oqctl.qc_dqblk.dqb_curspace != 717 ||
-            oqctl.qc_dqblk.dqb_curinodes != 414 ||
-            oqctl.qc_dqblk.dqb_btime != 0 ||
-            oqctl.qc_dqblk.dqb_itime != 0) {
-                CERROR("3l: quotactl Q_GETQUOTA get wrong result:"
-                       LPU64", "LPU64", "LPU64", "LPU64", "LPU64", "LPU64", "
-                       LPU64", "LPU64"\n", oqctl.qc_dqblk.dqb_bhardlimit,
-                       oqctl.qc_dqblk.dqb_bsoftlimit,
-                       oqctl.qc_dqblk.dqb_ihardlimit,
-                       oqctl.qc_dqblk.dqb_isoftlimit,
-                       oqctl.qc_dqblk.dqb_curspace,
-                       oqctl.qc_dqblk.dqb_curinodes,
-                       oqctl.qc_dqblk.dqb_btime,
-                       oqctl.qc_dqblk.dqb_itime);
-                RETURN(-EINVAL);
-        }
-
-        RETURN(0);
-}
-
-/* Test quotaoff */
-static int quotactl_test_4(struct obd_device *obd, struct super_block *sb)
-{
-        int rc;
-        ENTRY;
-
-        oqctl.qc_cmd = Q_QUOTAOFF;
-        oqctl.qc_id = 500;
-        oqctl.qc_type = UGQUOTA;
-        rc = fsfilt_quotactl(obd, sb, &oqctl);
-        if (rc) {
-                CERROR("4a: quotactl Q_QUOTAOFF failed: %d\n", rc);
-                RETURN(rc);
-        }
-
-        RETURN(0);
-}
-
-/* -------------------------------------------------------------------------
- * Tests above, boring obd functions below
- * ------------------------------------------------------------------------- */
-static int quotactl_run_tests(struct obd_device *obd, struct obd_device *tgt)
-{
-        struct super_block *sb;
-        struct lvfs_run_ctxt saved;
-        int rc;
-        ENTRY;
-
-        if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDS_NAME) &&
-            !strcmp(tgt->obd_type->typ_name, "obdfilter")) {
-                CERROR("TARGET OBD should be mds or ost\n");
-                RETURN(-EINVAL);
-        }
-
-        sb = tgt->u.obt.obt_sb;
-
-        push_ctxt(&saved, &tgt->obd_lvfs_ctxt, NULL);
-
-        rc = quotactl_test_1(tgt, sb);
-        if (rc)
-                GOTO(cleanup, rc);
-
-#if 0
-        rc = quotactl_test_2(tgt, sb);
-        if (rc)
-                GOTO(cleanup, rc);
-#endif
-
-        rc = quotactl_test_3(tgt, sb);
-        if (rc)
-                GOTO(cleanup, rc);
-
- cleanup:
-        quotactl_test_4(tgt, sb);
-
-        pop_ctxt(&saved, &tgt->obd_lvfs_ctxt, NULL);
-
-        return rc;
-}
-
-#ifdef LPROCFS
-static struct lprocfs_vars lprocfs_quotactl_test_obd_vars[] = { {0} };
-static struct lprocfs_vars lprocfs_quotactl_test_module_vars[] = { {0} };
-
-void lprocfs_quotactl_test_init_vars(struct lprocfs_static_vars *lvars)
-{
-    lvars->module_vars  = lprocfs_quotactl_test_module_vars;
-    lvars->obd_vars     = lprocfs_quotactl_test_obd_vars;
-}
-#endif
-
-static int quotactl_test_cleanup(struct obd_device *obd)
-{
-        lprocfs_obd_cleanup(obd);
-        return 0;
-}
-
-static int quotactl_test_setup(struct obd_device *obd, obd_count len, void *buf)
-{
-        struct lprocfs_static_vars lvars = { 0 };
-        struct lustre_cfg *lcfg = buf;
-        struct obd_device *tgt;
-        int rc;
-        ENTRY;
-
-        if (lcfg->lcfg_bufcount < 1) {
-                CERROR("requires a mds OBD name\n");
-                RETURN(-EINVAL);
-        }
-
-        tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
-        if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
-                CERROR("target device not attached or not set up (%s)\n",
-                       lustre_cfg_string(lcfg, 1));
-                RETURN(-EINVAL);
-        }
-
-        lprocfs_quotactl_test_init_vars(&lvars);
-        lprocfs_obd_setup(obd, lvars.obd_vars);
-
-        rc = quotactl_run_tests(obd, tgt);
-
-        quotactl_test_cleanup(obd);
-
-        RETURN(rc);
-}
-
-static struct obd_ops quotactl_obd_ops = {
-        .o_owner       = THIS_MODULE,
-        .o_setup       = quotactl_test_setup,
-        .o_cleanup     = quotactl_test_cleanup,
-};
-
-static int __init quotactl_test_init(void)
-{
-        struct lprocfs_static_vars lvars = { 0 };
-
-        lprocfs_quotactl_test_init_vars(&lvars);
-        return class_register_type(&quotactl_obd_ops, lvars.module_vars,
-                                   "quotactl_test");
-}
-
-static void __exit quotactl_test_exit(void)
-{
-        class_unregister_type("quotactl_test");
-}
-
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("quotactl test module");
-MODULE_LICENSE("GPL");
-
-module_init(quotactl_test_init);
-module_exit(quotactl_test_exit);
diff --git a/lustre/scripts/.cvsignore b/lustre/scripts/.cvsignore

index e5a85d6..d524d4b 100644 (file)
--- a/lustre/scripts/.cvsignore
+++ b/lustre/scripts/.cvsignore
@@ -10,6 +10,7 @@ TAGS
  version_tag.pl
  lustre_createcsv
  lustre_config
+lustre_start
  lc_net
  lc_modprobe
  lc_hb
diff --git a/lustre/scripts/Makefile.am b/lustre/scripts/Makefile.am

index f0a44a3..15d05c2 100644 (file)
--- a/lustre/scripts/Makefile.am
+++ b/lustre/scripts/Makefile.am
@@ -1,12 +1,44 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
  #
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
  
  sbinscripts = lc_servip lustre_up14 lustre_rmmod
  
  # These are scripts that are generated from .in files
-genscripts = lustre_config lc_modprobe lc_net lc_hb lc_cluman lustre_createcsv lc_md lc_lvm
+genscripts = lustre_config lc_modprobe lc_net lc_hb lc_cluman lustre_createcsv \
+    lc_md lc_lvm lustre_start
  
  sbin_SCRIPTS = $(genscripts) $(sbinscripts)
  bin_SCRIPTS = lustre_req_history
diff --git a/lustre/scripts/lc_common b/lustre/scripts/lc_common

index 8b1bcbe..2d67971 100644 (file)
--- a/lustre/scripts/lc_common
+++ b/lustre/scripts/lc_common
@@ -1,38 +1,37 @@
-#
+#!/bin/bash
+
  # vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
+
  #
-# lc_common - This file contains functions to be used by most or all
+# lc_common - This file contains common variables and functions to be used by
  #             Lustre cluster config scripts.
  #
  ################################################################################
  
-# Remote command 
-REMOTE=${REMOTE:-"ssh -x -q"}
-#REMOTE=${REMOTE:-"pdsh -S -R ssh -w"}
-export REMOTE
+#****************************** Common Variables ******************************#
+export PATH=$PATH:/sbin:/usr/sbin
  
-# Lustre utilities
-CMD_PATH=${CMD_PATH:-"/usr/sbin"}
-MKFS=${MKFS:-"$CMD_PATH/mkfs.lustre"}
-TUNEFS=${TUNEFS:-"$CMD_PATH/tunefs.lustre"}
-LCTL=${LCTL:-"$CMD_PATH/lctl"}
+# Remote command
+export REMOTE=${REMOTE:-"ssh -x -q"}
+#export REMOTE=${REMOTE:-"pdsh -S -R ssh -w"}
  
-EXPORT_PATH=${EXPORT_PATH:-"PATH=\$PATH:/sbin:/usr/sbin;"}
+# Lustre utilities
+export MKFS=${MKFS:-"mkfs.lustre"}
+export TUNEFS=${TUNEFS:-"tunefs.lustre"}
+export LCTL=${LCTL:-"lctl"}
  
-# Raid command path
-RAID_CMD_PATH=${RAID_CMD_PATH:-"/sbin"}
-MDADM=${MDADM:-"$RAID_CMD_PATH/mdadm"}
+# Software RAID command
+export MDADM=${MDADM:-"mdadm"}
  
  # Some scripts to be called
-SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"$(cd `dirname $0`; echo $PWD)"}
-MODULE_CONFIG=${SCRIPTS_PATH}/lc_modprobe
-VERIFY_CLUSTER_NET=${SCRIPTS_PATH}/lc_net
-GEN_HB_CONFIG=${SCRIPTS_PATH}/lc_hb
-GEN_CLUMGR_CONFIG=${SCRIPTS_PATH}/lc_cluman
-SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}/lc_servip
-SCRIPT_GEN_MONCF=${SCRIPTS_PATH}/lc_mon
-SCRIPT_CONFIG_MD=${SCRIPTS_PATH}/lc_md
-SCRIPT_CONFIG_LVM=${SCRIPTS_PATH}/lc_lvm
+export MODULE_CONFIG=${MODULE_CONFIG:-"lc_modprobe"}
+export VERIFY_CLUSTER_NET=${VERIFY_CLUSTER_NET:-"lc_net"}
+export GEN_HB_CONFIG=${GEN_HB_CONFIG:-"lc_hb"}
+export GEN_CLUMGR_CONFIG=${GEN_CLUMGR_CONFIG:-"lc_cluman"}
+export SCRIPT_VERIFY_SRVIP=${SCRIPT_VERIFY_SRVIP:-"lc_servip"}
+export SCRIPT_GEN_MONCF=${SCRIPT_GEN_MONCF:-"lc_mon"}
+export SCRIPT_CONFIG_MD=${SCRIPT_CONFIG_MD:-"lc_md"}
+export SCRIPT_CONFIG_LVM=${SCRIPT_CONFIG_LVM:-"lc_lvm"}
  
  # Variables of HA software
  HBVER_HBV1="hbv1"                   # Heartbeat version 1
@@ -62,23 +61,42 @@ FS_TYPE=${FS_TYPE:-"lustre"}        # Lustre filesystem type
  FILE_SUFFIX=${FILE_SUFFIX:-".lustre"}  # Suffix of the generated config files
  
  # Marker of the MD device line
-MD_MARKER=${MD_MARKER:-"MD"}
+export MD_MARKER=${MD_MARKER:-"MD"}
  
  # Marker of the LVM device line
-PV_MARKER=${PV_MARKER:-"PV"}
-VG_MARKER=${VG_MARKER:-"VG"}
-LV_MARKER=${LV_MARKER:-"LV"}
+export PV_MARKER=${PV_MARKER:-"PV"}
+export VG_MARKER=${VG_MARKER:-"VG"}
+export LV_MARKER=${LV_MARKER:-"LV"}
  
-declare -a CONFIG_ITEM              # Items in each line of the csv file
+declare -a CONFIG_ITEM              # Items in each line of the CSV file
  declare -a NODE_NAME                # Hostnames of nodes have been configured
  
-# Nodelist variables
-USE_ALLNODES=false                  # default is not to operate on all the nodes
-SPECIFIED_NODELIST=""               # specified list of nodes to be operated on
-EXCLUDED_NODELIST=""                # list of nodes to be excluded
+declare -a MGS_NODENAME             # Node names of the MGS servers
+declare -a MGS_IDX                  # Indexes of MGSs in the global arrays
+declare -i MGS_NUM                  # Number of MGS servers in the cluster
+declare -i INIT_IDX
+
+# All of the Lustre target items in the CSV file
+declare -a HOST_NAME MODULE_OPTS DEVICE_NAME MOUNT_POINT DEVICE_TYPE FS_NAME
+declare -a MGS_NIDS INDEX FORMAT_OPTIONS MKFS_OPTIONS MOUNT_OPTIONS FAILOVERS
  
-export PATH=$PATH:$CMD_PATH:$SCRIPTS_PATH:$CLUMAN_TOOLS_PATH:$RAID_CMD_PATH:/sbin:/usr/sbin
+# Heartbeat software requires that node names in the configuration directive
+# must (normally) match the "uname -n" of that machine. Since the value of the
+# "failover nids" field in the CSV file is the NID(s) of failover partner node,
+# we have to figure out the corresponding hostname of that node.
+declare -a FAILOVERS_NAMES
  
+export VERIFY_CONNECT=true          # Verify network connectivity by default
+export USE_ALLNODES=false           # Not operating on all the nodes by default
+export SPECIFIED_NODELIST=""        # Specified list of nodes to be operated on
+export EXCLUDED_NODELIST=""         # Specified list of nodes to be excluded
+export NODES_TO_USE=""              # Defacto list of nodes to be operated on
+export NODELIST_OPT=""
+export VERBOSE_OUTPUT=false
+export VERBOSE_OPT=""
+
+
+#****************************** Common Functions ******************************#
  
  # verbose_output string
  # Output verbose information $string
@@ -89,6 +107,24 @@ verbose_output() {
      return 0
  }
  
+# error_output string
+# Output error string to stderr, prefixing with ERROR
+# for easy error parsing from the rest of the output.
+error_output() {
+    echo >&2 "$(basename $0): ERROR: $*"
+    return 0
+}
+
+# error_exit rc string
+# Output error to stderr via error_output and exit with rc.
+error_exit() {
+    local rc=$1
+    shift
+
+    error_output $*
+    exit $rc
+}
+
  # Check whether the reomte command is pdsh
  is_pdsh() {
      if [ "${REMOTE}" = "${REMOTE#*pdsh}" ]; then
@@ -103,13 +139,13 @@ is_pdsh() {
  check_file() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 "`basename $0`: check_file() error: Missing csv file!"
+        error_output "check_file(): Missing CSV file!"
          return 1
      fi
  
-    CSV_FILE=$1
+    local CSV_FILE=$1
      if [ ! -s ${CSV_FILE} ]; then
-        echo >&2 "`basename $0`: check_file() error: ${CSV_FILE}"\
+        error_output "check_file(): ${CSV_FILE}"\
                   "does not exist or is empty!"
          return 1
      fi
@@ -118,21 +154,21 @@ check_file() {
  }
  
  # parse_line line
-# Parse a line in the csv file
+# Parse a line in the CSV file
  parse_line() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 "`basename $0`: parse_line() error: Missing argument!"
+        error_output "parse_line(): Missing argument!"
          return 1
      fi
  
      declare -i i=0              # Index of the CONFIG_ITEM array
-    declare -i length=0 
+    declare -i length=0
      declare -i idx=0
-    declare -i s_quote_flag=0   # Flag of the single quote character 
+    declare -i s_quote_flag=0   # Flag of the single quote character
      declare -i d_quote_flag=0   # Flag of the double quotes character
      local TMP_LETTER LINE
- 
+
      LINE="$*"
  
      # Initialize the CONFIG_ITEM array
@@ -239,12 +275,12 @@ remote_error() {
      ret_str=$*
  
      if [ "${ret_str}" != "${ret_str#*connect:*}" ]; then
-        echo >&2 "`basename $0`: ${fn_name}() error: ${ret_str}"
+        error_output "${fn_name}(): ${ret_str}"
          return 0
      fi
  
      if [ -z "${ret_str}" ]; then
-        echo >&2 "`basename $0`: ${fn_name}() error:" \
+        error_output "${fn_name}():" \
          "No results from remote!" \
          "Check network connectivity between the local host and ${host_addr}!"
          return 0
@@ -267,7 +303,7 @@ nid2hostname() {
          echo "`basename $0`: nid2hostname() error: Invalid nid - \"${nid}\"!"
          return 1
      fi
-               
+
      case "${nettype}" in
      lo*)    host_name=`hostname`;;
      elan*)  # QsNet
@@ -365,7 +401,7 @@ ip2hostname_single_node() {
                  echo "${host_name}"
                  return 1
              fi
-                       
+
              nid=${host_name}@${nettype}
              ;;
          esac
@@ -449,18 +485,18 @@ exclude_items_from_list() {
             OUTLIST="$OUTLIST,$ITEM"
          fi
      done
-                                
+
      # strip leading comma
      echo ${OUTLIST#,}
  }
  
  # get_csv_nodelist csv_file
-# Get the comma-separated list of all the nodes from the csv file
+# Get the comma-separated list of all the nodes from the CSV file
  get_csv_nodelist() {
      local csv_file=$1
      local all_nodelist
  
-    # Check the csv file
+    # Check the CSV file
      ! check_file ${csv_file} 2>&1 && return 1
  
      all_nodelist=$(egrep -v "([[:space:]]|^)#" ${csv_file} | cut -d, -f 1)
@@ -477,7 +513,7 @@ get_csv_nodelist() {
  get_nodelist() {
      local ALL_NODELIST
  
-    # Get the list of all the nodes in the csv file
+    # Get the list of all the nodes in the CSV file
      ALL_NODELIST=$(get_csv_nodelist ${CSV_FILE})
      [ ${PIPESTATUS[0]} -ne 0 ] && echo "${ALL_NODELIST}" && return 1
  
@@ -513,9 +549,10 @@ check_nodelist() {
      local nodes_to_use=$1
  
      if [ -z "${nodes_to_use}" ]; then
-        echo "`basename $0`: There are no hosts to be operated on."\
+        error_output "There are no nodes to be operated on."\
               "Check the node selection options (-a, -w or -x)."
-        usage
+        usage 1>&2
+        return 1
      else
          verbose_output "Operating on the following nodes: ${nodes_to_use}"
      fi
@@ -548,7 +585,7 @@ nid_in_nidlist() {
  
  # get_mgs_nids mgs_hostname mgs_nids
  # Get the corresponding NID(s) of the MGS node ${mgs_hostname} from the
-# "mgs nids" field of one lustre target in the csv file
+# "mgs nids" field of one lustre target in the CSV file
  get_mgs_nids() {
      local mgs_node="$1"
      local all_mgs_nids="$2"
@@ -565,7 +602,8 @@ get_mgs_nids() {
      done
  
      # Let's use lctl to get the real nids from the mgs node
-    ret_str=$(${REMOTE} ${mgs_node} "${LCTL} list_nids" 2>&1 </dev/null)
+    ret_str=$($REMOTE $mgs_node "PATH=\$PATH:/sbin:/usr/sbin
+$LCTL list_nids" 2>&1 </dev/null)
      if [ ${PIPESTATUS[0]} -ne 0 -a -n "${ret_str}" ]; then
          echo "$(basename $0): get_mgs_nids() error:" \
          "remote command to ${mgs_node} error: ${ret_str}"
@@ -589,3 +627,440 @@ get_mgs_nids() {
  
      return 1
  }
+
+# Check the items required for OSTs, MDTs and MGS
+#
+# When formatting an OST, the following items: hostname,
+# device name, device type and mgs nids, cannot have null value.
+#
+# When formatting an MDT or MGS, the following items: hostname,
+# device name and device type, cannot have null value.
+check_lustre_item() {
+    # Check argument
+    if [ $# -eq 0 ]; then
+        error_output "check_lustre_item(): Missing argument"\
+                  "for function check_lustre_item()!"
+        return 1
+    fi
+
+    declare -i i=$1
+
+    # Check hostname, device name and device type
+    if [ -z "${HOST_NAME[i]}" ] || \
+    [ -z "${DEVICE_NAME[i]}" ] || [ -z "${DEVICE_TYPE[i]}" ]; then
+        error_output "check_lustre_item(): Some required"\
+                  "item has null value! Check hostname,"\
+                  "device name and device type!"
+        return 1
+    fi
+
+    # Check mgs nids
+    if [ "${DEVICE_TYPE[i]}" = "ost" ]&&[ -z "${MGS_NIDS[i]}" ]; then
+        error_output "check_lustre_item(): OST's mgs nids"\
+                  "item has null value!"
+        return 1
+    fi
+
+    # Check mount point
+    if [ -z "${MOUNT_POINT[i]}" ]; then
+        error_output "check_lustre_item(): mount"\
+                  "point item of target ${DEVICE_NAME[i]} has null value!"
+        return 1
+    fi
+
+    return 0
+}
+
+# Get the number of MGS nodes in the cluster
+get_mgs_num() {
+    INIT_IDX=0
+    MGS_NUM=${#MGS_NODENAME[@]}
+    [ -z "${MGS_NODENAME[0]}" ] && let "INIT_IDX += 1" \
+    && let "MGS_NUM += 1"
+}
+
+# is_mgs_node hostname
+# Verify whether @hostname is a MGS node
+is_mgs_node() {
+    local host_name=$1
+    declare -i i
+
+    get_mgs_num
+    for ((i = ${INIT_IDX}; i < ${MGS_NUM}; i++)); do
+        [ "${MGS_NODENAME[i]}" = "${host_name}" ] && return 0
+    done
+
+    return 1
+}
+
+# Check whether the MGS nodes are in the same failover group
+check_mgs_group() {
+    declare -i i
+    declare -i j
+    declare -i idx
+    local mgs_node
+
+    get_mgs_num
+    for ((i = ${INIT_IDX}; i < ${MGS_NUM}; i++)); do
+        mgs_node=${MGS_NODENAME[i]}
+        for ((j = ${INIT_IDX}; j < ${MGS_NUM}; j++)); do
+          [ "${MGS_NODENAME[j]}" = "${mgs_node}" ] && continue 1
+
+          idx=${MGS_IDX[j]}
+          if [ "${FAILOVERS_NAMES[idx]#*$mgs_node*}" = "${FAILOVERS_NAMES[idx]}" ]
+          then
+            error_output "check_mgs_group():"\
+            "MGS node ${mgs_node} is not in the ${HOST_NAME[idx]}"\
+            "failover group!"
+            return 1
+          fi
+        done
+    done
+
+    return 0
+}
+
+# Get and check MGS servers.
+# There should be no more than one MGS specified in the entire CSV file.
+check_mgs() {
+    declare -i i
+    declare -i j
+    declare -i exp_idx    # Index of explicit MGS servers
+    declare -i imp_idx    # Index of implicit MGS servers
+    local is_exp_mgs is_imp_mgs
+    local mgs_node
+
+    # Initialize the MGS_NODENAME and MGS_IDX arrays
+    unset MGS_NODENAME
+    unset MGS_IDX
+
+    exp_idx=1
+    imp_idx=1
+    for ((i = 0; i < ${#HOST_NAME[@]}; i++)); do
+        is_exp_mgs=false
+        is_imp_mgs=false
+
+        # Check whether this node is an explicit MGS node 
+        # or an implicit one
+        if [ "${DEVICE_TYPE[i]#*mgs*}" != "${DEVICE_TYPE[i]}" ]; then
+            verbose_output "Explicit MGS target" \
+            "${DEVICE_NAME[i]} in host ${HOST_NAME[i]}."
+            is_exp_mgs=true
+        fi
+
+        if [ "${DEVICE_TYPE[i]}" = "mdt" -a -z "${MGS_NIDS[i]}" ]; then
+            verbose_output "Implicit MGS target" \
+            "${DEVICE_NAME[i]} in host ${HOST_NAME[i]}."
+            is_imp_mgs=true
+        fi
+
+        # Get and check MGS servers
+        if ${is_exp_mgs} || ${is_imp_mgs}; then
+            # Check whether more than one MGS target in one MGS node
+            if is_mgs_node ${HOST_NAME[i]}; then
+                error_output "check_mgs():"\
+                "More than one MGS target in the same node -"\
+                "\"${HOST_NAME[i]}\"!"
+                return 1
+            fi
+
+            # Get and check primary MGS server and backup MGS server        
+            if [ "${FORMAT_OPTIONS[i]}" = "${FORMAT_OPTIONS[i]#*noformat*}" ]
+            then
+                # Primary MGS server
+                if [ -z "${MGS_NODENAME[0]}" ]; then
+                    if [ "${is_exp_mgs}" = "true" -a ${imp_idx} -gt 1 ] \
+                    || [ "${is_imp_mgs}" = "true" -a ${exp_idx} -gt 1 ]; then
+                        error_output "check_mgs():"\
+                        "There exist both explicit and implicit MGS"\
+                        "targets in the CSV file!"
+                        return 1
+                    fi
+                    MGS_NODENAME[0]=${HOST_NAME[i]}
+                    MGS_IDX[0]=$i
+                else
+                    mgs_node=${MGS_NODENAME[0]}
+                    if [ "${FAILOVERS_NAMES[i]#*$mgs_node*}" = "${FAILOVERS_NAMES[i]}" ]
+                    then
+                        error_output "check_mgs():"\
+                        "More than one primary MGS nodes in the CSV" \
+                        "file - ${MGS_NODENAME[0]} and ${HOST_NAME[i]}!"
+                    else
+                        error_output "check_mgs():"\
+                        "MGS nodes ${MGS_NODENAME[0]} and ${HOST_NAME[i]}"\
+                        "are failover pair, one of them should use"\
+                        "\"--noformat\" in the format options item!"
+                    fi
+                    return 1
+                fi
+            else    # Backup MGS server
+                if [ "${is_exp_mgs}" = "true" -a ${imp_idx} -gt 1 ] \
+                || [ "${is_imp_mgs}" = "true" -a ${exp_idx} -gt 1 ]; then
+                    error_output "check_mgs():"\
+                    "There exist both explicit and implicit MGS"\
+                    "targets in the CSV file!"
+                    return 1
+                fi
+
+                if ${is_exp_mgs}; then # Explicit MGS
+                    MGS_NODENAME[exp_idx]=${HOST_NAME[i]}
+                    MGS_IDX[exp_idx]=$i
+                    exp_idx=$(( exp_idx + 1 ))
+                else    # Implicit MGS
+                    MGS_NODENAME[imp_idx]=${HOST_NAME[i]}
+                    MGS_IDX[imp_idx]=$i
+                    imp_idx=$(( imp_idx + 1 ))
+                fi
+            fi
+        fi #End of "if ${is_exp_mgs} || ${is_imp_mgs}"
+    done
+
+    # Check whether the MGS nodes are in the same failover group
+    if ! check_mgs_group; then
+        return 1
+    fi
+
+    return 0
+}
+
+# Execute remote command to add module options to
+# the module configuration file
+add_module_options() {
+    declare -i i=$1
+    local hostname=$2
+
+    if [ -z "$hostname" ]; then
+        error_output "add_module_options(): Missing hostname!"
+        return 1
+    fi
+
+    [ -z "${MODULE_OPTS[i]}" ] && return 0
+
+    # Execute remote command to add module options to
+    # the module configuration file
+    verbose_output "Adding module options to $hostname"
+    $REMOTE $hostname "PATH=\$PATH:/sbin:/usr/sbin
+echo \"${MODULE_OPTS[i]}\" | $MODULE_CONFIG"
+    local RC=${PIPESTATUS[0]}
+    if [ $RC -ne 0 ]; then
+        error_output "add_module_options():"\
+        "Failed to add module options to $hostname!"
+        return $RC
+    fi
+
+    return 0
+}
+
+# check_lnet_connect hostname_index mgs_hostname
+# Check whether the target node can contact the MGS node @mgs_hostname
+# If @mgs_hostname is null, then it means the primary MGS node
+check_lnet_connect() {
+    declare -i i=$1
+    local mgs_node=$2
+
+    local mgs_prim_nids
+    local nids_str=
+    local mgs_nid 
+    local ping_mgs
+    local try
+
+    # Execute remote command to check that 
+    # this node can contact the MGS node
+    verbose_output "Checking lnet connectivity between" \
+    "${HOST_NAME[i]} and the MGS node ${mgs_node}"
+    mgs_prim_nids=`echo ${MGS_NIDS[i]} | awk -F: '{print $1}'`
+
+    if [ -z "${mgs_node}" -o $MGS_NUM -eq 1 ]; then
+        nids_str=${mgs_prim_nids}    # nids of primary MGS node
+        if [ -z "${nids_str}" ]; then
+            error_output "check_lnet_connect():"\
+            "Check the mgs nids item of host ${HOST_NAME[i]}!"\
+            "Missing nids of the primary MGS node!"
+            return 1
+        fi
+    else
+        # Get the corresponding NID(s) of the MGS node ${mgs_node}
+        # from the "mgs nids" field
+        nids_str=$(get_mgs_nids ${mgs_node} ${MGS_NIDS[i]})
+        if [ ${PIPESTATUS[0]} -ne 0 ]; then
+            error_output "${nids_str}"
+            return 1
+        fi
+    fi
+
+    ping_mgs=false
+    for mgs_nid in ${nids_str//,/ }
+    do
+        for try in $(seq 0 5); do
+            $REMOTE ${HOST_NAME[i]} "PATH=\$PATH:/sbin:/usr/sbin
+$LCTL ping $mgs_nid 5 1>/dev/null"
+            if [ ${PIPESTATUS[0]} -eq 0 ]; then
+                # This node can contact the MGS node
+                verbose_output "${HOST_NAME[i]} can contact the MGS" \
+                "node $mgs_node by using nid \"$mgs_nid\"!"
+                ping_mgs=true
+                break
+            fi
+        done
+    done
+
+    if ! ${ping_mgs}; then
+        error_output "check_lnet_connect():" \
+        "${HOST_NAME[i]} cannot contact the MGS node ${mgs_node}"\
+        "with nids - \"${nids_str}\"! Check ${LCTL} command!"
+        return 1
+    fi
+
+    return 0
+}
+
+# Start lnet network in the cluster node and check that 
+# this node can contact the MGS node
+check_lnet() {
+    if ! $VERIFY_CONNECT; then
+        return 0
+    fi
+
+    # Check argument
+    if [ $# -eq 0 ]; then
+        error_output "check_lnet(): Missing argument!"
+        return 1
+    fi
+
+    declare -i i=$1
+    declare -i j
+    local ret_str
+
+    # Execute remote command to start lnet network
+    verbose_output "Starting lnet network on ${HOST_NAME[i]}"
+    ret_str=$($REMOTE ${HOST_NAME[i]} "PATH=\$PATH:/sbin:/usr/sbin
+modprobe lnet && $LCTL network up" 2>&1)
+    if [ ${PIPESTATUS[0]} -ne 0 ]; then
+        error_output "check_lnet(): start lnet network on" \
+        "${HOST_NAME[i]} error: $ret_str"
+        return 1
+    fi
+
+    if is_mgs_node ${HOST_NAME[i]}; then
+        return 0
+    fi
+
+    # Execute remote command to check that 
+    # this node can contact the MGS node
+    for ((j = 0; j < ${MGS_NUM}; j++)); do
+        if ! check_lnet_connect $i ${MGS_NODENAME[j]}; then
+            return 1
+        fi
+    done
+
+    return 0
+}
+
+# Start lnet network in the MGS node
+start_mgs_lnet() {
+    declare -i i
+    declare -i idx
+
+    if [ -z "${MGS_NODENAME[0]}" -a  -z "${MGS_NODENAME[1]}" ]; then
+        if ${USE_ALLNODES}; then
+            verbose_output "There is no MGS target in the ${CSV_FILE} file."
+        else
+            verbose_output "There is no MGS target in the node list \"${NODES_TO_USE}\"."
+        fi
+        return 0
+    fi
+
+    for ((i = ${INIT_IDX}; i < ${MGS_NUM}; i++)); do
+        # Execute remote command to add lnet options lines to 
+        # the MGS node's modprobe.conf/modules.conf
+        idx=${MGS_IDX[i]}
+        add_module_options $idx ${MGS_NODENAME[i]} || return ${PIPESTATUS[0]}
+
+        # Start lnet network in the MGS node
+        check_lnet $idx || return ${PIPESTATUS[0]}
+    done
+
+    return 0
+}
+
+# Get all the Lustre target items in the CSV file and do some checks.
+get_lustre_items() {
+    # Check argument
+    if [ $# -eq 0 ]; then
+        error_output "get_lustre_items(): Missing argument"\
+                  "for function get_lustre_items()!"
+        return 1
+    fi
+
+    local CSV_FILE=$1
+    local LINE
+    local marker
+    local hostname
+    declare -i line_num=0
+    declare -i idx=0
+
+    exec 9< ${CSV_FILE}
+    while read -u 9 -r LINE; do
+        line_num=${line_num}+1
+        # verbose_output "Parsing line ${line_num}: $LINE"
+
+        # Get rid of the empty line
+        [ -z "`echo ${LINE} | awk '/[[:alnum:]]/ {print $0}'`" ] && continue
+
+        # Get rid of the comment line
+        [ -z "`echo \"${LINE}\" | egrep -v \"([[:space:]]|^)#\"`" ] && continue
+
+        # Skip the Linux MD/LVM line
+        marker=$(echo ${LINE} | cut -d, -f 2)
+        if [ "${marker}" = "${MD_MARKER}" -o "${marker}" = "${PV_MARKER}" ] \
+        || [ "${marker}" = "${VG_MARKER}" -o "${marker}" = "${LV_MARKER}" ]; then
+            continue
+        fi
+
+        # Skip the host which is not specified in the host list
+        if ! ${USE_ALLNODES}; then
+            hostname=$(echo ${LINE} | cut -d, -f 1)
+            ! host_in_hostlist ${hostname} ${NODES_TO_USE} && continue
+        fi
+
+        # Parse the config line into CONFIG_ITEM
+        if ! parse_line "$LINE"; then
+            error_output "parse_line(): Occurred"\
+                  "on line ${line_num} in ${CSV_FILE}: $LINE"
+            return 1
+        fi
+
+        HOST_NAME[idx]=${CONFIG_ITEM[0]}
+        MODULE_OPTS[idx]=${CONFIG_ITEM[1]}
+        DEVICE_NAME[idx]=${CONFIG_ITEM[2]}
+        MOUNT_POINT[idx]=${CONFIG_ITEM[3]}
+        DEVICE_TYPE[idx]=${CONFIG_ITEM[4]}
+        FS_NAME[idx]=${CONFIG_ITEM[5]}
+        MGS_NIDS[idx]=${CONFIG_ITEM[6]}
+        INDEX[idx]=${CONFIG_ITEM[7]}
+        FORMAT_OPTIONS[idx]=${CONFIG_ITEM[8]}
+        MKFS_OPTIONS[idx]=${CONFIG_ITEM[9]}
+        MOUNT_OPTIONS[idx]=${CONFIG_ITEM[10]}
+        FAILOVERS[idx]=${CONFIG_ITEM[11]}
+
+        MODULE_OPTS[idx]=`echo "${MODULE_OPTS[idx]}" | sed 's/"/\\\"/g'`
+
+        # Convert IP addresses in NIDs to hostnames
+        FAILOVERS_NAMES[idx]=$(ip2hostname_multi_node ${FAILOVERS[idx]})
+        if [ ${PIPESTATUS[0]} -ne 0 ]; then
+            error_output "${FAILOVERS_NAMES[idx]}"
+            return 1
+        fi
+
+        # Check some required items for formatting target
+        if ! check_lustre_item $idx; then
+            error_output "check_lustre_item():"\
+                  "Occurred on line ${line_num} in ${CSV_FILE}."
+            return 1    
+        fi
+
+        idx=${idx}+1
+    done
+
+    return 0
+}
diff --git a/lustre/scripts/lc_hb.in b/lustre/scripts/lc_hb.in

index 0fa1fb0..08a8661 100644 (file)
--- a/lustre/scripts/lc_hb.in
+++ b/lustre/scripts/lc_hb.in
@@ -1,4 +1,7 @@
  #!/bin/bash
+
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
+
  #
  # lc_hb - script for generating the Heartbeat HA software's
  #         configuration files
@@ -62,7 +65,7 @@ while getopts "r:n:vd:" OPTION; do
                 HBVER_OPT=$OPTARG
                 if [ "${HBVER_OPT}" != "${HBVER_HBV1}" ] \
                 && [ "${HBVER_OPT}" != "${HBVER_HBV2}" ]; then
-                       echo >&2 $"`basename $0`: Invalid Heartbeat software" \
+                       error_output "Invalid Heartbeat software" \
                                   "version - ${HBVER_OPT}!"
                         usage
                 fi
@@ -71,17 +74,17 @@ while getopts "r:n:vd:" OPTION; do
                 HOSTNAME_OPT=$OPTARG 
                 PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'`
                 if [ -z "${PRIM_NODENAME}" ]; then
-                       echo >&2 $"`basename $0`: Missing primary nodename!"
+                       error_output "Missing primary nodename!"
                         usage
                 fi
                 HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'`
                 if [ ${HOSTNAME_NUM} -lt 2 ]; then
-                       echo >&2 $"`basename $0`: Missing failover nodenames!"
+                       error_output "Missing failover nodenames!"
                         usage
                 fi
                 if [ "${HBVER_OPT}" = "${HBVER_HBV1}" -a ${HOSTNAME_NUM} -gt 2 ]
                 then
-                       echo >&2 $"`basename $0`: Heartbeat version 1 can" \
+                       error_output "Heartbeat version 1 can" \
                                   "only support 2 nodes!"
                         usage
                 fi
@@ -94,11 +97,11 @@ while getopts "r:n:vd:" OPTION; do
                 TARGET_DEVNAMES[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $1}'`
                 TARGET_MNTPNTS[TARGET_NUM]=`echo ${DEVICE_OPT}|awk -F: '{print $2}'`
                 if [ -z "${TARGET_DEVNAMES[TARGET_NUM]}" ]; then
-                       echo >&2 $"`basename $0`: Missing target device name!"
+                       error_output "Missing target device name!"
                         usage
                 fi
                 if [ -z "${TARGET_MNTPNTS[TARGET_NUM]}" ]; then
-                       echo >&2 $"`basename $0`: Missing mount point for target"\
+                       error_output "Missing mount point for target"\
                                   "${TARGET_DEVNAMES[TARGET_NUM]}!"
                         usage
                 fi
@@ -111,17 +114,17 @@ done
  
  # Check the required parameters
  if [ -z "${HBVER_OPT}" ]; then
-       echo >&2 $"`basename $0`: Missing -r option!"
+       error_output "Missing -r option!"
         usage
  fi
  
  if [ -z "${HOSTNAME_OPT}" ]; then
-       echo >&2 $"`basename $0`: Missing -n option!"
+       error_output "Missing -n option!"
         usage
  fi
  
  if [ -z "${DEVICE_OPT}" ]; then
-       echo >&2 $"`basename $0`: Missing -d option!"
+       error_output "Missing -d option!"
         usage
  fi
  
@@ -152,13 +155,13 @@ check_remote_file() {
         local file_name=$2
  
         if [ -z "${host_name}" ]; then
-               echo >&2 "`basename $0`: check_remote_file() error:"\
+               error_output "check_remote_file():"\
                          "Missing hostname!"
                 return 1
         fi
  
         if [ -z "${file_name}" ]; then
-               echo >&2 "`basename $0`: check_remote_file() error:"\
+               error_output "check_remote_file():"\
                          "Missing file name!"
                 return 1
         fi
@@ -166,7 +169,7 @@ check_remote_file() {
         # Execute remote command to check the file 
         ${REMOTE} ${host_name} "[ -e ${file_name} ]"
         if [ $? -ne 0 ]; then
-               echo >&2 "`basename $0`: check_remote_file() error:"\
+               error_output "check_remote_file():"\
                 "${file_name} does not exist in host ${host_name}!"
                 return 1
         fi
@@ -184,7 +187,7 @@ hb_running() {
         ret_str=`${REMOTE} ${host_name} "${CL_STATUS} hbstatus" 2>&1`
         if [ $? -ne 0 ]; then
                 if [ "${ret_str}" = "${ret_str#*stop*}" ]; then
-                       echo >&2 "`basename $0`: hb_running() error:"\
+                       error_output "hb_running():"\
                         "remote command to ${host_name} error: ${ret_str}!"
                         return 2
                 else
@@ -202,9 +205,10 @@ stop_heartbeat() {
         local host_name=$1
         local ret_str
  
-       ret_str=`${REMOTE} ${host_name} "/sbin/service heartbeat stop" 2>&1`
+       ret_str=$(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin
+service heartbeat stop < /dev/null" 2>&1)
         if [ $? -ne 0 ]; then
-               echo >&2 "`basename $0`: stop_heartbeat() error:"\
+               error_output "stop_heartbeat():"\
                 "remote command to ${host_name} error: ${ret_str}!"
                 return 1
         fi
@@ -223,7 +227,7 @@ check_heartbeat() {
         for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
                 # Check Heartbeat configuration directory
                 if ! check_remote_file ${NODE_NAMES[idx]} ${HA_DIR}; then
-                       echo >&2 "`basename $0`: check_heartbeat() error:"\
+                       error_output "check_heartbeat():"\
                         "Is Heartbeat package installed?"
                         return 1
                 fi
@@ -231,8 +235,8 @@ check_heartbeat() {
                 if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
                         # Check mon configuration directory
                         if ! check_remote_file ${NODE_NAMES[idx]} ${MON_DIR}; then
-                               echo >&2 "`basename $0`: check_heartbeat()"\
-                               "error: Is mon package installed?"
+                               error_output "check_heartbeat():"\
+                               "Is mon package installed?"
                                 return 1
                         fi
                 fi
@@ -240,8 +244,8 @@ check_heartbeat() {
                 if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                         # Check crm directory
                         if ! check_remote_file ${NODE_NAMES[idx]} ${CIB_DIR}; then
-                               echo >&2 "`basename $0`: check_heartbeat()"\
-                               "error: Is Heartbeat v2 package installed?"
+                               error_output "check_heartbeat():"\
+                               "Is Heartbeat v2 package installed?"
                                 return 1
                         fi
                 fi
@@ -284,8 +288,8 @@ get_srvname() {
         local ret_str
  
         # Execute remote command to get the target server name
-       ret_str=`${REMOTE} ${host_name} \
-               "${TUNEFS} --print --verbose ${target_devname} | grep Target:" 2>&1`
+       ret_str=$(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin
+${TUNEFS} --print --verbose ${target_devname} | grep Target:" 2>&1)
         if [ $? -ne 0 ]; then
                 echo "`basename $0`: get_srvname() error:" \
                      "from host ${host_name} - ${ret_str}"
@@ -321,7 +325,7 @@ get_srvnames() {
                 TARGET_SRVNAMES[i]=$(get_srvname ${PRIM_NODENAME} \
                                      ${TARGET_DEVNAMES[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${TARGET_SRVNAMES[i]}"
+                       error_output "${TARGET_SRVNAMES[i]}"
                         return 1
                 fi
         done
@@ -397,7 +401,7 @@ create_hacf() {
                 touch ${TMP_DIR}$"/ha.cf."${NODE_NAMES[idx]}
                 scp ${HACF_LUSTRE} ${NODE_NAMES[idx]}:${HA_DIR}/
                 if [ $? -ne 0 ]; then
-                       echo >&2 "`basename $0`: Failed to scp ha.cf file"\
+                       error_output "Failed to scp ha.cf file"\
                                  "to node ${NODE_NAMES[idx]}!"
                         return 1
                 fi
@@ -444,7 +448,7 @@ create_haresources() {
                 python ${CIB_GEN_SCRIPT} --stdout \
                 ${HARES_LUSTRE} > ${CIB_LUSTRE}
                 if [ $? -ne 0 ]; then
-                       echo >&2 "`basename $0`: Failed to generate cib.xml file"\
+                       error_output "Failed to generate cib.xml file"\
                                  "for node ${PRIM_NODENAME}!"
                         return 1
                 fi
@@ -455,7 +459,7 @@ create_haresources() {
                 /bin/cp -f ${HARES_LUSTRE} ${TMP_DIR}$"/haresources."${NODE_NAMES[idx]}
                 scp ${HARES_LUSTRE} ${NODE_NAMES[idx]}:${HA_DIR}/
                 if [ $? -ne 0 ]; then
-                       echo >&2 "`basename $0`: Failed to scp haresources file"\
+                       error_output "Failed to scp haresources file"\
                                  "to node ${NODE_NAMES[idx]}!"
                         return 1
                 fi
@@ -463,7 +467,7 @@ create_haresources() {
                 if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
                         scp ${CIB_LUSTRE} ${NODE_NAMES[idx]}:${CIB_DIR}/
                         if [ $? -ne 0 ]; then
-                               echo >&2 "`basename $0`: Failed to scp cib.xml"\
+                               error_output "Failed to scp cib.xml"\
                                          "file to node ${NODE_NAMES[idx]}!"
                                 return 1
                         fi
@@ -491,7 +495,7 @@ create_authkeys() {
                 touch ${TMP_DIR}$"/authkeys."${NODE_NAMES[idx]}
                 scp -p ${AUTHKEYS_TEMP} ${NODE_NAMES[idx]}:${HA_DIR}/
                 if [ $? -ne 0 ]; then
-                       echo >&2 "`basename $0`: Failed to scp authkeys file"\
+                       error_output "Failed to scp authkeys file"\
                                  "to node ${NODE_NAMES[idx]}!"
                         return 1
                 fi
@@ -547,7 +551,7 @@ create_moncf() {
  
         ${SCRIPT_GEN_MONCF} ${params}
         if [ $? -ne 0 ]; then
-               echo >&2 "`basename $0`: Failed to generate mon.cf file"\
+               error_output "Failed to generate mon.cf file"\
                          "by using ${SCRIPT_GEN_MONCF}!"
                 return 1
         fi
@@ -560,7 +564,7 @@ create_moncf() {
  
                 scp ${MONCF_LUSTRE} ${NODE_NAMES[idx]}:${MON_DIR}/
                 if [ $? -ne 0 ]; then
-                       echo >&2 "`basename $0`: Failed to scp mon.cf file"\
+                       error_output "Failed to scp mon.cf file"\
                                  "to node ${NODE_NAMES[idx]}!"
                         return 1
                 fi
diff --git a/lustre/scripts/lc_lvm.in b/lustre/scripts/lc_lvm.in

index 98248d7..3f0d616 100644 (file)
--- a/lustre/scripts/lc_lvm.in
+++ b/lustre/scripts/lc_lvm.in
@@ -162,17 +162,19 @@ shift  `expr $OPTIND - 1`
  
  # Here we expect the csv file
  if [ $# -eq 0 ]; then
-    echo >&2 "`basename $0`: Missing csv file!"
+    error_output "Missing csv file!"
      usage
  fi
  
+CSV_FILE=$1
+
  # check_lvm_item index
  #
  # Check the items required for managing LVM device ${LVM_NAME[index]}
  check_lvm_item() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 "`basename $0`: check_lvm_item() error:"\
+        error_output "check_lvm_item():"\
                   "Missing argument!"
          return 1
      fi
@@ -181,7 +183,7 @@ check_lvm_item() {
  
      # Check hostname
      if [ -z "${HOST_NAME[i]}" ]; then
-        echo >&2 "`basename $0`: check_lvm_item() error:"\
+        error_output "check_lvm_item():"\
                   "hostname item has null value!"
          return 1
      fi
@@ -190,7 +192,7 @@ check_lvm_item() {
      if [ -z "${LVM_NAME[i]}" ] \
      && [ "${LINE_MARKER[i]}" != "${LV_MARKER}" -a "${OP_MODE[i]}" != "remove" ]
      then
-        echo >&2 "`basename $0`: check_lvm_item() error:"\
+        error_output "check_lvm_item():"\
                   "LVM component name item has null value!"
          return 1
      fi
@@ -199,7 +201,7 @@ check_lvm_item() {
      if [ -n "${OP_MODE[i]}" ] \
      && [ "${OP_MODE[i]}" != "create" -a "${OP_MODE[i]}" != "remove" ]
      then
-        echo >&2 "`basename $0`: check_lvm_item() error:"\
+        error_output "check_lvm_item():"\
                   "Invalid operation mode item - \"${OP_MODE[i]}\"!"
          return 1
      fi
@@ -208,20 +210,20 @@ check_lvm_item() {
      if [ -z "${OP_MODE[i]}" -o "${OP_MODE[i]}" = "create" ]; then
          if [ "${LINE_MARKER[i]}" = "${VG_MARKER}" -a -z "${SIXTH_ITEM[i]}" ]
          then
-            echo >&2 "`basename $0`: check_lvm_item() error:"\
+            error_output "check_lvm_item():"\
              "pv paths item of vg ${LVM_NAME[i]} has null value!"
              return 1
          fi
  
          if [ "${LINE_MARKER[i]}" = "${LV_MARKER}" ]; then
              if [ -z "${SIXTH_ITEM[i]}" ]; then
-                echo >&2 "`basename $0`: check_lvm_item() error:"\
+                error_output "check_lvm_item():"\
                           "lv size item has null value!"
                  return 1
              fi
  
              if [ -z "${SEVENTH_ITEM[i]}" ]; then
-                echo >&2 "`basename $0`: check_lvm_item() error:"\
+                error_output "check_lvm_item():"\
                           "vg name item has null value!"
                  return 1
              fi
@@ -237,11 +239,11 @@ check_lvm_item() {
  get_lvm_items() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 "`basename $0`: get_lvm_items() error: Missing csv file!"
+        error_output "get_lvm_items(): Missing csv file!"
          return 1
      fi
  
-    CSV_FILE=$1
+    local CSV_FILE=$1
      local LINE line_marker
      local hostname
      declare -i line_num=0
@@ -280,7 +282,7 @@ get_lvm_items() {
  
          # Check some required items
          if ! check_lvm_item $idx; then
-            echo >&2 "`basename $0`: check_lvm_item() error:"\
+            error_output "check_lvm_item():"\
                       "Occurred on line ${line_num} in ${CSV_FILE}."
              return 1    
          fi
@@ -473,7 +475,7 @@ construct_lvm_cmdline() {
                      fi
                      ;;
              *)
-                echo >&2 "`basename $0`: construct_lvm_cmdline() error:"\
+                error_output "construct_lvm_cmdline():"\
                           "Invalid operation mode - \"${OP_MODE[i]}\"!"
                  return 1
                  ;;
@@ -511,7 +513,7 @@ config_lvm_devs() {
      verbose_output "Configuring LVM devices in host ${host_name}..."
      verbose_output "Configure command line is: \"${LVM_CMDLINE}\""
      REMOTE_CMD[pid_num]="${REMOTE} ${host_name} \"${LVM_CMDLINE}\""
-    ${REMOTE} ${host_name} "(${EXPORT_PATH} ${LVM_CMDLINE})" >&2 &
+    $REMOTE $host_name "export PATH=\$PATH:/sbin:/usr/sbin; $LVM_CMDLINE" &
      REMOTE_PID[pid_num]=$!
      let "pid_num += 1"
  
@@ -553,7 +555,7 @@ config_lvm() {
      for ((pid_num = 0; pid_num < ${#REMOTE_PID[@]}; pid_num++)); do
          wait ${REMOTE_PID[${pid_num}]}
          if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "`basename $0`: config_lvm() error: Failed"\
+            error_output "config_lvm(): Failed"\
                   "to execute \"${REMOTE_CMD[${pid_num}]}\"!"
              failed_status=true
          fi
@@ -569,13 +571,10 @@ config_lvm() {
  
  # Main flow
  # Check the csv file
-if ! check_file $1; then
-    exit 1    
-fi
+check_file $CSV_FILE || exit ${PIPESTATUS[0]}
  
  # Get the list of nodes to be operated on
-NODES_TO_USE=$(get_nodelist)
-[ ${PIPESTATUS[0]} -ne 0 ] && echo >&2 "${NODES_TO_USE}" && exit 1
+NODES_TO_USE=$(get_nodelist) || error_exit ${PIPESTATUS[0]} "$NODES_TO_USE"
  
  # Check the node list
  check_nodelist ${NODES_TO_USE} || exit 1
diff --git a/lustre/scripts/lc_md.in b/lustre/scripts/lc_md.in

index ab741af..0790ebc 100644 (file)
--- a/lustre/scripts/lc_md.in
+++ b/lustre/scripts/lc_md.in
@@ -114,17 +114,19 @@ shift  `expr $OPTIND - 1`
  
  # Here we expect the csv file
  if [ $# -eq 0 ]; then
-    echo >&2 "`basename $0`: Missing csv file!"
+    error_output "Missing csv file!"
      usage
  fi
  
+CSV_FILE=$1
+
  # check_md_item index
  #
  # Check the items required for managing MD device ${MD_NAME[index]}
  check_md_item() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 "`basename $0`: check_md_item() error:"\
+        error_output "check_md_item():"\
                   "Missing argument!"
          return 1
      fi
@@ -133,7 +135,7 @@ check_md_item() {
  
      # Check hostname
      if [ -z "${HOST_NAME[i]}" ]; then
-        echo >&2 "`basename $0`: check_md_item() error:"\
+        error_output "check_md_item():"\
                   "hostname item has null value!"
          return 1
      fi
@@ -142,19 +144,19 @@ check_md_item() {
      if [ -z "${OP_MODE[i]}" -o "${OP_MODE[i]}" = "create" ]; then
          # Check MD device name 
          if [ -z "${MD_NAME[i]}" ]; then
-            echo >&2 "`basename $0`: check_md_item() error:"\
+            error_output "check_md_item():"\
              "md name item has null value!"
              return 1
          fi
  
          if [ -z "${RAID_LEVEL[i]}" ]; then
-            echo >&2 "`basename $0`: check_md_item() error:"\
+            error_output "check_md_item():"\
              "raid level item of MD device ${MD_NAME[i]} has null value!"
              return 1
          fi
  
          if [ -z "${MD_DEVS[i]}" ]; then
-            echo >&2 "`basename $0`: check_md_item() error:"\
+            error_output "check_md_item():"\
              "component devices item of ${MD_NAME[i]} has null value!"
              return 1
          fi
@@ -169,11 +171,11 @@ check_md_item() {
  get_md_items() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 "`basename $0`: get_md_items() error: Missing csv file!"
+        error_output "get_md_items(): Missing csv file!"
          return 1
      fi
  
-    CSV_FILE=$1
+    local CSV_FILE=$1
      local LINE
      local hostname
      declare -i line_num=0
@@ -208,7 +210,7 @@ get_md_items() {
  
          # Check some required items
          if ! check_md_item $idx; then
-            echo >&2 "`basename $0`: check_md_item() error:"\
+            error_output "check_md_item():"\
                       "Occurred on line ${line_num} in ${CSV_FILE}."
              return 1    
          fi
@@ -231,7 +233,7 @@ md_is_active() {
      ret_str=$(${REMOTE} ${host_name} "${cmd}" 2>&1)
      if [ ${PIPESTATUS[0]} -ne 0 ]; then
          if [ -n "${ret_str}" ]; then
-            echo >&2 "`basename $0`: md_is_active() error:"\
+            error_output "md_is_active():"\
              "remote command to ${host_name} error: ${ret_str}!"
              return 2    # Error occurred
          else
@@ -365,7 +367,7 @@ construct_mdadm_cmdline() {
                      # Construct the create command line
                      mdadm_cmd=$(construct_mdadm_create_cmdline ${i})
                      if [ ${PIPESTATUS[0]} -ne 0 ]; then
-                        echo >&2 "${mdadm_cmd}"
+                        error_output "${mdadm_cmd}"
                          return 1
                      fi
  
@@ -429,7 +431,7 @@ config_md_devs() {
      verbose_output "Configuring MD devices in host ${host_name}..."
      verbose_output "Configure command line is: \"${MDADM_CMDLINE}\""
      REMOTE_CMD[pid_num]="${REMOTE} ${host_name} \"${MDADM_CMDLINE}\""
-    ${REMOTE} ${host_name} "${MDADM_CMDLINE}" >&2 &
+    $REMOTE $host_name "export PATH=\$PATH:/sbin:/usr/sbin; $MDADM_CMDLINE" &
      REMOTE_PID[pid_num]=$!
      let "pid_num += 1"
      sleep 1
@@ -471,7 +473,7 @@ config_md() {
      for ((pid_num = 0; pid_num < ${#REMOTE_PID[@]}; pid_num++)); do
          wait ${REMOTE_PID[${pid_num}]}
          if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "`basename $0`: config_md() error: Failed"\
+            error_output "config_md(): Failed"\
                   "to execute \"${REMOTE_CMD[${pid_num}]}\"!"
              failed_status=true
          fi
@@ -487,13 +489,10 @@ config_md() {
  
  # Main flow
  # Check the csv file
-if ! check_file $1; then
-    exit 1    
-fi
+check_file $CSV_FILE || exit ${PIPESTATUS[0]}
  
  # Get the list of nodes to be operated on
-NODES_TO_USE=$(get_nodelist)
-[ ${PIPESTATUS[0]} -ne 0 ] && echo >&2 "${NODES_TO_USE}" && exit 1
+NODES_TO_USE=$(get_nodelist) || error_exit ${PIPESTATUS[0]} "$NODES_TO_USE"
  
  # Check the node list
  check_nodelist ${NODES_TO_USE} || exit 1
diff --git a/lustre/scripts/lc_net.in b/lustre/scripts/lc_net.in

index d618c69..16196d3 100644 (file)
--- a/lustre/scripts/lc_net.in
+++ b/lustre/scripts/lc_net.in
@@ -1,4 +1,7 @@
  #!/bin/bash
+
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
+
  #
  # lc_net - script for Lustre cluster network verification
  #
@@ -58,7 +61,7 @@ shift  `expr $OPTIND - 1`
  
  # Here we expect the csv file
  if [ $# -eq 0 ]; then
-       echo >&2 $"`basename $0`: Missing csv file!"
+       error_output "Missing csv file!"
         usage
  fi
  
@@ -76,7 +79,7 @@ get_hostnames() {
  
         # Get the list of nodes to be operated on
         NODES_TO_USE=$(get_nodelist)
-       [ ${PIPESTATUS[0]} -ne 0 ] && echo >&2 "${NODES_TO_USE}" && return 1
+       [ ${PIPESTATUS[0]} -ne 0 ] && error_output "${NODES_TO_USE}" && return 1
  
         # Check the node list
         if [ -z "${NODES_TO_USE}" ]; then
@@ -133,7 +136,7 @@ local_check() {
         # and get the IP address of this host from ping
         HOST_IPADDRS[i]=$(ping_host ${HOST_NAMES[i]})
         if [ ${PIPESTATUS[0]} -ne 0 ]; then
-               echo >&2 "${HOST_IPADDRS[i]}"
+               error_output "${HOST_IPADDRS[i]}"
                 return 1
         fi
  
@@ -153,13 +156,13 @@ remote_check() {
         cmd="ping -c1 ${HOST_NAMES[i]} 2>&1"
         ret_str=$(${REMOTE} ${HOST_NAMES[i]} "${cmd}" 2>&1)
         if [ ${PIPESTATUS[0]} -ne 0 -a -n "${ret_str}" ]; then
-               echo >&2 "`basename $0`: remote_check() error:"\
+               error_output "remote_check():"\
                 "remote to ${HOST_NAMES[i]} error: ${ret_str}!"
                 return 1
         fi
  
         if [ -z "${ret_str}" ]; then
-               echo >&2 "`basename $0`: remote_check() error:"\
+               error_output "remote_check():"\
                 "No results from ${HOST_NAMES[i]}! Check the network"\
                 "connectivity between local host and ${HOST_NAMES[i]}!"
                 return 1
@@ -177,7 +180,7 @@ remote_check() {
         # Check whether ${HOST_NAMES[i]} agrees with the local host
         # about what its name is resolved to.
         if [ "${ip_addr}" != "${HOST_IPADDRS[i]}" ]; then
-               echo >&2 "`basename $0`: remote_check() error:"\
+               error_output "remote_check():"\
                 "Local host resolves ${HOST_NAMES[i]} to IP address"\
                 "\"${HOST_IPADDRS[i]}\", while its own resolution is"\
                 "\"${ip_addr}\". They are not the same!"
diff --git a/lustre/scripts/license-status b/lustre/scripts/license-status

index 5407b91..de2cb02 100755 (executable)
--- a/lustre/scripts/license-status
+++ b/lustre/scripts/license-status
@@ -1,6 +1,6 @@
  #! /bin/sh
  # license-status - Display the status of files in the current directory
-# Copyright (C) 2001  Cluster File Systems, Inc.
+# Copyright 2008 Sun Microsystems, Inc.
  #
  # This code is issued under the GNU General Public License.
  # See the file COPYING in this distribution
diff --git a/lustre/scripts/lustre_config.in b/lustre/scripts/lustre_config.in

index 3fb13e8..fcdf45f 100644 (file)
--- a/lustre/scripts/lustre_config.in
+++ b/lustre/scripts/lustre_config.in
@@ -17,9 +17,9 @@
  
  # Usage
  usage() {
-    cat >&2 <<EOF
+    cat <<EOF
  
-Usage:  `basename $0` [options] <csv file>
+Usage: $(basename $0) [options] <-a|-w|-x> <csv file>
  
      This script is used to format and set up multiple lustre servers from a
      csv file.
@@ -53,7 +53,6 @@ Usage:  `basename $0` [options] <csv file>
                  (separated by commas) for each target in a Lustre cluster
  
  EOF
-    exit 1
  }
  
  # Samples 
@@ -233,29 +232,12 @@ EOF
  . @scriptlibdir@/lc_common
  
  #***************************** Global variables *****************************#
-declare -a MGS_NODENAME             # node names of the MGS servers
-declare -a MGS_IDX                  # indexes of MGSs in the global arrays
-declare -i MGS_NUM                  # number of MGS servers in the cluster
-declare -i INIT_IDX
-
  declare -a NODE_NAMES               # node names in the failover group
  declare -a TARGET_OPTS              # target services in one failover group
  
-# All the items in the csv file
-declare -a HOST_NAME MODULE_OPTS DEVICE_NAME MOUNT_POINT DEVICE_TYPE FS_NAME
-declare -a MGS_NIDS INDEX FORMAT_OPTIONS MKFS_OPTIONS MOUNT_OPTIONS FAILOVERS
-
-# Heartbeat software requires that node names in the configuration directive
-# must (normally) match the "uname -n" of that machine. Since the value of the
-# "failover nids" field in the csv file is the NID(s) of failover partner node,
-# we have to figure out the corresponding hostname of that node.
-declare -a FAILOVERS_NAMES
-
-VERIFY_CONNECT=true
  CONFIG_MD_LVM=false
  MODIFY_FSTAB=true
  UPGRADE_TARGET=false
-VERBOSE_OUTPUT=false
  # Get and check the positional parameters
  while getopts "aw:x:t:ndfmuhv" OPTION; do
      case $OPTION in
@@ -279,9 +261,10 @@ while getopts "aw:x:t:ndfmuhv" OPTION; do
          if [ "${HATYPE_OPT}" != "${HBVER_HBV1}" ] \
          && [ "${HATYPE_OPT}" != "${HBVER_HBV2}" ] \
          && [ "${HATYPE_OPT}" != "${HATYPE_CLUMGR}" ]; then
-            echo >&2 $"`basename $0`: Invalid HA software type" \
+            error_output "Invalid HA software type" \
                        "- ${HATYPE_OPT}!"
-            usage
+            usage 1>&2
+            exit 1
          fi
          ;;
      n)
@@ -300,6 +283,7 @@ while getopts "aw:x:t:ndfmuhv" OPTION; do
          UPGRADE_TARGET=true 
          ;;
      h)
+        usage
          sample
          ;;
      v)
@@ -307,7 +291,9 @@ while getopts "aw:x:t:ndfmuhv" OPTION; do
          VERBOSE_OUTPUT=true
          ;;
      ?)
-        usage 
+        usage 1>&2
+        exit 1
+        ;;
      esac
  done
  
@@ -316,210 +302,18 @@ shift  `expr $OPTIND - 1`
  
  # Here we expect the csv file
  if [ $# -eq 0 ]; then
-    echo >&2 $"`basename $0`: Missing csv file!"
-    usage
+    error_output "Missing csv file!"
+    usage 1>&2
+    exit 1
  fi
  
-# Check the items required for OSTs, MDTs and MGS
-#
-# When formatting an OST, the following items: hostname, module_opts,
-# device name, device type and mgs nids, cannot have null value.
-#
-# When formatting an MDT or MGS, the following items: hostname,
-# module_opts, device name and device type, cannot have null value.
-check_item() {
-    # Check argument
-    if [ $# -eq 0 ]; then
-        echo >&2 $"`basename $0`: check_item() error: Missing argument"\
-                  "for function check_item()!"
-        return 1
-    fi
-
-    declare -i i=$1
-
-    # Check hostname, module_opts, device name and device type
-    if [ -z "${HOST_NAME[i]}" ]||[ -z "${MODULE_OPTS[i]}" ]\
-    ||[ -z "${DEVICE_NAME[i]}" ]||[ -z "${DEVICE_TYPE[i]}" ]; then
-        echo >&2 $"`basename $0`: check_item() error: Some required"\
-                  "item has null value! Check hostname, module_opts,"\
-                  "device name and device type!"
-        return 1
-    fi
-
-    # Check mgs nids
-    if [ "${DEVICE_TYPE[i]}" = "ost" ]&&[ -z "${MGS_NIDS[i]}" ]; then
-        echo >&2 $"`basename $0`: check_item() error: OST's mgs nids"\
-                  "item has null value!"
-        return 1
-    fi
-
-    # Check mount point
-    if [ -z "${MOUNT_POINT[i]}" ]; then
-        echo >&2 $"`basename $0`: check_item() error: mount"\
-                  "point item of target ${DEVICE_NAME[i]} has null value!"
-        return 1
-    fi
-
-    return 0
-}
-
-# Get the number of MGS nodes in the cluster
-get_mgs_num() {
-    INIT_IDX=0
-    MGS_NUM=${#MGS_NODENAME[@]}
-    [ -z "${MGS_NODENAME[0]}" ] && let "INIT_IDX += 1" \
-    && let "MGS_NUM += 1"
-}
-
-# is_mgs_node hostname
-# Verify whether @hostname is a MGS node
-is_mgs_node() {
-    local host_name=$1
-    declare -i i
-
-    get_mgs_num
-    for ((i = ${INIT_IDX}; i < ${MGS_NUM}; i++)); do
-        [ "${MGS_NODENAME[i]}" = "${host_name}" ] && return 0
-    done
-
-    return 1
-}
-
-# Check whether the MGS nodes are in the same failover group
-check_mgs_group() {
-    declare -i i
-    declare -i j
-    declare -i idx
-    local mgs_node
-
-    get_mgs_num
-    for ((i = ${INIT_IDX}; i < ${MGS_NUM}; i++)); do
-        mgs_node=${MGS_NODENAME[i]}
-        for ((j = ${INIT_IDX}; j < ${MGS_NUM}; j++)); do
-          [ "${MGS_NODENAME[j]}" = "${mgs_node}" ] && continue 1
-
-          idx=${MGS_IDX[j]}
-          if [ "${FAILOVERS_NAMES[idx]#*$mgs_node*}" = "${FAILOVERS_NAMES[idx]}" ]
-          then
-            echo >&2 $"`basename $0`: check_mgs_group() error:"\
-            "MGS node ${mgs_node} is not in the ${HOST_NAME[idx]}"\
-            "failover group!"
-            return 1
-          fi
-        done
-    done
-
-    return 0
-}
-
-# Get and check MGS servers.
-# There should be no more than one MGS specified in the entire csv file.
-check_mgs() {
-    declare -i i
-    declare -i j
-    declare -i exp_idx    # Index of explicit MGS servers
-    declare -i imp_idx    # Index of implicit MGS servers
-    local is_exp_mgs is_imp_mgs
-    local mgs_node
-
-    # Initialize the MGS_NODENAME and MGS_IDX arrays
-    unset MGS_NODENAME
-    unset MGS_IDX
-
-    exp_idx=1
-    imp_idx=1
-    for ((i = 0; i < ${#HOST_NAME[@]}; i++)); do
-        is_exp_mgs=false
-        is_imp_mgs=false
-
-        # Check whether this node is an explicit MGS node 
-        # or an implicit one
-        if [ "${DEVICE_TYPE[i]#*mgs*}" != "${DEVICE_TYPE[i]}" ]; then
-            verbose_output "Explicit MGS target" \
-            "${DEVICE_NAME[i]} in host ${HOST_NAME[i]}."
-            is_exp_mgs=true
-        fi
-
-        if [ "${DEVICE_TYPE[i]}" = "mdt" -a -z "${MGS_NIDS[i]}" ]; then
-            verbose_output "Implicit MGS target" \
-            "${DEVICE_NAME[i]} in host ${HOST_NAME[i]}."
-            is_imp_mgs=true
-        fi
-
-        # Get and check MGS servers
-        if ${is_exp_mgs} || ${is_imp_mgs}; then
-            # Check whether more than one MGS target in one MGS node
-            if is_mgs_node ${HOST_NAME[i]}; then
-                echo >&2 $"`basename $0`: check_mgs() error:"\
-                "More than one MGS target in the same node -"\
-                "\"${HOST_NAME[i]}\"!"
-                return 1
-            fi
-
-            # Get and check primary MGS server and backup MGS server        
-            if [ "${FORMAT_OPTIONS[i]}" = "${FORMAT_OPTIONS[i]#*noformat*}" ]
-            then
-                # Primary MGS server
-                if [ -z "${MGS_NODENAME[0]}" ]; then
-                    if [ "${is_exp_mgs}" = "true" -a ${imp_idx} -gt 1 ] \
-                    || [ "${is_imp_mgs}" = "true" -a ${exp_idx} -gt 1 ]; then
-                        echo >&2 $"`basename $0`: check_mgs() error:"\
-                        "There exist both explicit and implicit MGS"\
-                        "targets in the csv file!"
-                        return 1
-                    fi
-                    MGS_NODENAME[0]=${HOST_NAME[i]}
-                    MGS_IDX[0]=$i
-                else
-                    mgs_node=${MGS_NODENAME[0]}
-                    if [ "${FAILOVERS_NAMES[i]#*$mgs_node*}" = "${FAILOVERS_NAMES[i]}" ]
-                    then
-                        echo >&2 $"`basename $0`: check_mgs() error:"\
-                        "More than one primary MGS nodes in the csv" \
-                        "file - ${MGS_NODENAME[0]} and ${HOST_NAME[i]}!"
-                    else
-                        echo >&2 $"`basename $0`: check_mgs() error:"\
-                        "MGS nodes ${MGS_NODENAME[0]} and ${HOST_NAME[i]}"\
-                        "are failover pair, one of them should use"\
-                        "\"--noformat\" in the format options item!"
-                    fi
-                    return 1
-                fi
-            else    # Backup MGS server
-                if [ "${is_exp_mgs}" = "true" -a ${imp_idx} -gt 1 ] \
-                || [ "${is_imp_mgs}" = "true" -a ${exp_idx} -gt 1 ]; then
-                    echo >&2 $"`basename $0`: check_mgs() error:"\
-                    "There exist both explicit and implicit MGS"\
-                    "targets in the csv file!"
-                    return 1
-                fi
-
-                if ${is_exp_mgs}; then # Explicit MGS
-                    MGS_NODENAME[exp_idx]=${HOST_NAME[i]}
-                    MGS_IDX[exp_idx]=$i
-                    exp_idx=$(( exp_idx + 1 ))
-                else    # Implicit MGS
-                    MGS_NODENAME[imp_idx]=${HOST_NAME[i]}
-                    MGS_IDX[imp_idx]=$i
-                    imp_idx=$(( imp_idx + 1 ))
-                fi
-            fi
-        fi #End of "if ${is_exp_mgs} || ${is_imp_mgs}"
-    done
-
-    # Check whether the MGS nodes are in the same failover group
-    if ! check_mgs_group; then
-        return 1
-    fi
-
-    return 0
-}
+CSV_FILE=$1
  
  # Construct the command line of mkfs.lustre
  construct_mkfs_cmdline() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 $"`basename $0`: construct_mkfs_cmdline() error:"\
+        error_output "construct_mkfs_cmdline():"\
                    "Missing argument for function construct_mkfs_cmdline()!"
          return 1
      fi
@@ -548,7 +342,7 @@ construct_mkfs_cmdline() {
          MKFS_CMD="$MKFS_CMD --mgs --mdt"
          ;;
      *)
-        echo >&2 $"`basename $0`: construct_mkfs_cmdline() error:"\
+        error_output "construct_mkfs_cmdline():"\
                    "Invalid device type - \"${DEVICE_TYPE[i]}\"!"
          return 1
          ;;
@@ -596,7 +390,7 @@ construct_mkfs_cmdline() {
  get_nodenames() {
      # Check argument
      if [ $# -eq 0 ]; then
-        echo >&2 $"`basename $0`: get_nodenames() error: Missing"\
+        error_output "get_nodenames(): Missing"\
                    "argument for function get_nodenames()!"
          return 1
      fi
@@ -615,7 +409,7 @@ get_nodenames() {
      do
          NODE_NAMES[idx]=$(nids2hostname ${nids})
          if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "${NODE_NAMES[idx]}"
+            error_output "${NODE_NAMES[idx]}"
              return 1
          fi
      
@@ -645,7 +439,7 @@ gen_ha_config() {
      HOSTNAME_OPT=${HOST_NAME[i]}
  
      if ! get_nodenames $i; then
-        echo >&2 $"`basename $0`: gen_ha_config() error: Can not get the"\
+        error_output "gen_ha_config(): Can not get the"\
          "failover nodenames from failover nids - \"${FAILOVERS[i]}\" in"\
          "the \"${HOST_NAME[i]}\" failover group!"
          return 1
@@ -746,238 +540,6 @@ config_ha() {
      return 0
  }
  
-# Get all the items in the csv file and do some checks.
-get_items() {
-    # Check argument
-    if [ $# -eq 0 ]; then
-        echo >&2 $"`basename $0`: get_items() error: Missing argument"\
-                  "for function get_items()!"
-        return 1
-    fi
-
-    CSV_FILE=$1
-    local LINE
-    local marker
-    local hostname
-    declare -i line_num=0
-    declare -i idx=0
-
-    exec 9< ${CSV_FILE}
-    while read -u 9 -r LINE; do
-        line_num=${line_num}+1
-        # verbose_output "Parsing line ${line_num}: $LINE"
-
-        # Get rid of the empty line
-        if [ -z "`echo ${LINE}|awk '/[[:alnum:]]/ {print $0}'`" ]; then
-            continue
-        fi
-
-        # Get rid of the comment line
-        if [ -z "`echo \"${LINE}\" | egrep -v \"([[:space:]]|^)#\"`" ]
-        then
-            continue
-        fi
-
-        # Skip the Linux MD/LVM line
-        marker=$(echo ${LINE} | cut -d, -f 2)
-        if [ "${marker}" = "${MD_MARKER}" -o "${marker}" = "${PV_MARKER}" ] \
-        || [ "${marker}" = "${VG_MARKER}" -o "${marker}" = "${LV_MARKER}" ]; then
-            continue
-        fi
-
-        # Skip the host which is not specified in the host list
-        if ! ${USE_ALLNODES}; then
-            hostname=$(echo ${LINE} | cut -d, -f 1)
-            ! host_in_hostlist ${hostname} ${NODES_TO_USE} && continue
-        fi
-
-        # Parse the config line into CONFIG_ITEM
-        if ! parse_line "$LINE"; then
-            echo >&2 $"`basename $0`: parse_line() error: Occurred"\
-                  "on line ${line_num} in ${CSV_FILE}: $LINE"
-            return 1    
-        fi
-
-        HOST_NAME[idx]=${CONFIG_ITEM[0]}
-        MODULE_OPTS[idx]=${CONFIG_ITEM[1]}
-        DEVICE_NAME[idx]=${CONFIG_ITEM[2]}
-        MOUNT_POINT[idx]=${CONFIG_ITEM[3]}
-        DEVICE_TYPE[idx]=${CONFIG_ITEM[4]}
-        FS_NAME[idx]=${CONFIG_ITEM[5]}
-        MGS_NIDS[idx]=${CONFIG_ITEM[6]}
-        INDEX[idx]=${CONFIG_ITEM[7]}
-        FORMAT_OPTIONS[idx]=${CONFIG_ITEM[8]}
-        MKFS_OPTIONS[idx]=${CONFIG_ITEM[9]}
-        MOUNT_OPTIONS[idx]=${CONFIG_ITEM[10]}
-        FAILOVERS[idx]=${CONFIG_ITEM[11]}
-
-        MODULE_OPTS[idx]=`echo "${MODULE_OPTS[idx]}" | sed 's/"/\\\"/g'`
-
-        # Convert IP addresses in NIDs to hostnames
-        FAILOVERS_NAMES[idx]=$(ip2hostname_multi_node ${FAILOVERS[idx]})
-        if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "${FAILOVERS_NAMES[idx]}"
-            return 1
-        fi
-
-        # Check some required items for formatting target
-        if ! check_item $idx; then
-            echo >&2 $"`basename $0`: check_item() error:"\
-                  "Occurred on line ${line_num} in ${CSV_FILE}."
-            return 1    
-        fi
-
-        idx=${idx}+1
-    done
-
-    return 0
-}
-
-# check_lnet_connect hostname_index mgs_hostname
-# Check whether the target node can contact the MGS node @mgs_hostname
-# If @mgs_hostname is null, then it means the primary MGS node
-check_lnet_connect() {
-    declare -i i=$1
-    local mgs_node=$2
-
-    local COMMAND RET_STR
-    local mgs_prim_nids
-    local nids_str=
-    local mgs_nid 
-    local ping_mgs
-
-    # Execute remote command to check that 
-    # this node can contact the MGS node
-    verbose_output "Checking lnet connectivity between" \
-    "${HOST_NAME[i]} and the MGS node ${mgs_node}"
-    mgs_prim_nids=`echo ${MGS_NIDS[i]} | awk -F: '{print $1}'`
-
-    if [ -z "${mgs_node}" -o $MGS_NUM -eq 1 ]; then
-        nids_str=${mgs_prim_nids}    # nids of primary MGS node
-        if [ -z "${nids_str}" ]; then
-            echo >&2 $"`basename $0`: check_lnet_connect() error:"\
-            "Check the mgs nids item of host ${HOST_NAME[i]}!"\
-            "Missing nids of the primary MGS node!"
-            return 1
-        fi
-    else
-        # Get the corresponding NID(s) of the MGS node ${mgs_node}
-        # from the "mgs nids" field
-        nids_str=$(get_mgs_nids ${mgs_node} ${MGS_NIDS[i]})
-        if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "${nids_str}"
-            return 1
-        fi
-    fi
-
-    ping_mgs=false
-    for mgs_nid in ${nids_str//,/ }
-    do
-        COMMAND=$"${LCTL} ping ${mgs_nid} 5 || echo failed 2>&1"
-        RET_STR=$(${REMOTE} ${HOST_NAME[i]} "${COMMAND}" 2>&1)
-        if [ ${PIPESTATUS[0]} -eq 0 -a "${RET_STR}" = "${RET_STR#*failed*}" ]
-        then
-            # This node can contact the MGS node
-            verbose_output "${HOST_NAME[i]} can contact the MGS" \
-            "node ${mgs_node} by using nid \"${mgs_nid}\"!"
-            ping_mgs=true
-            break
-        fi
-    done
-
-    if ! ${ping_mgs}; then
-        echo >&2 "`basename $0`: check_lnet_connect() error:" \
-        "${HOST_NAME[i]} cannot contact the MGS node ${mgs_node}"\
-        "with nids - \"${nids_str}\"! Check ${LCTL} command!"
-        return 1
-    fi
-
-    return 0
-}
-
-# Start lnet network in the cluster node and check that 
-# this node can contact the MGS node
-check_lnet() {
-    if ! ${VERIFY_CONNECT}; then
-        return 0
-    fi
-
-    # Check argument
-    if [ $# -eq 0 ]; then
-        echo >&2 $"`basename $0`: check_lnet() error: Missing"\
-              "argument for function check_lnet()!"
-        return 1
-    fi
-
-    declare -i i=$1
-    declare -i j
-    local COMMAND RET_STR
-
-    # Execute remote command to start lnet network
-    verbose_output "Starting lnet network in ${HOST_NAME[i]}"
-    COMMAND="PATH=\$PATH:/sbin:/usr/sbin modprobe lnet; ${LCTL} network up 2>&1"
-    RET_STR=$(${REMOTE} ${HOST_NAME[i]} "${COMMAND}" 2>&1)
-    if [ ${PIPESTATUS[0]} -ne 0 -o "${RET_STR}" = "${RET_STR#*LNET configured*}" ]
-    then
-        echo >&2 "`basename $0`: check_lnet() error: remote" \
-                 "${HOST_NAME[i]} error: ${RET_STR}"
-        return 1
-    fi
-
-    if is_mgs_node ${HOST_NAME[i]}; then
-        return 0
-    fi
-
-    # Execute remote command to check that 
-    # this node can contact the MGS node
-    for ((j = 0; j < ${MGS_NUM}; j++)); do
-        if ! check_lnet_connect $i ${MGS_NODENAME[j]}; then
-            return 1
-        fi
-    done
-
-    return 0
-}
-
-# Start lnet network in the MGS node
-start_mgs_lnet() {
-    declare -i i
-    declare -i idx
-    local COMMAND
-
-    if [ -z "${MGS_NODENAME[0]}" -a  -z "${MGS_NODENAME[1]}" ]; then
-        if ${USE_ALLNODES}; then
-            verbose_output "There is no MGS target in the ${CSV_FILE} file."
-        else
-            verbose_output "There is no MGS target in the node list \"${NODES_TO_USE}\"."
-        fi
-        return 0
-    fi
-
-    for ((i = ${INIT_IDX}; i < ${MGS_NUM}; i++)); do
-        # Execute remote command to add lnet options lines to 
-        # the MGS node's modprobe.conf/modules.conf
-        idx=${MGS_IDX[i]}
-        COMMAND=$"echo \"${MODULE_OPTS[${idx}]}\"|${MODULE_CONFIG}"
-        verbose_output "Adding lnet module options to ${MGS_NODENAME[i]}"
-        ${REMOTE} ${MGS_NODENAME[i]} "${COMMAND}" >&2 
-        if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "`basename $0`: start_mgs_lnet() error:"\
-                 "Failed to execute remote command to" \
-                 "add module options to ${MGS_NODENAME[i]}!"\
-                 "Check ${MODULE_CONFIG}!"
-            return 1
-        fi
-
-        # Start lnet network in the MGS node
-        if ! check_lnet ${idx}; then
-            return 1    
-        fi
-    done
-
-    return 0
-}
-
  # Execute remote command to add lnet options lines to remote nodes'
  # modprobe.conf/modules.conf and format(mkfs.lustre) Lustre targets
  mass_config() {
@@ -986,9 +548,10 @@ mass_config() {
      declare -a REMOTE_CMD 
      declare -i pid_num=0
      declare -i i=0
+    local checked_hosts=""
  
      if [ ${#HOST_NAME[@]} -eq 0 ]; then
-        verbose_output "There are no lustre targets specified."
+        verbose_output "There are no Lustre targets specified."
          return 0
      fi
  
@@ -1009,30 +572,22 @@ mass_config() {
                         "${HOST_NAME[i]}"
          ${REMOTE} ${HOST_NAME[i]} "${COMMAND}" >&2 
          if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "`basename $0`: mass_config() error:"\
+            error_output "mass_config():"\
                   "Failed to execute remote command to"\
                   "create the mountpoint on ${HOST_NAME[i]}!"
              return 1
          fi
  
-        if ! $UPGRADE_TARGET && ! is_mgs_node ${HOST_NAME[i]}; then
+        if ! $UPGRADE_TARGET && ! is_mgs_node ${HOST_NAME[i]} && \
+        ! host_in_hostlist ${HOST_NAME[i]} $checked_hosts; then
              # Execute remote command to add lnet options lines to 
              # modprobe.conf/modules.conf
-            COMMAND=$"echo \"${MODULE_OPTS[i]}\"|${MODULE_CONFIG}"
-            verbose_output "Adding lnet module options to" \
-                       "${HOST_NAME[i]}"
-            ${REMOTE} ${HOST_NAME[i]} "${COMMAND}" >&2 
-            if [ ${PIPESTATUS[0]} -ne 0 ]; then
-                echo >&2 "`basename $0`: mass_config() error:"\
-                     "Failed to execute remote command to"\
-                     "add module options to ${HOST_NAME[i]}!"
-                return 1
-            fi
+            add_module_options $i ${HOST_NAME[i]} || return ${PIPESTATUS[0]}
  
              # Check lnet networks
-            if ! check_lnet $i; then
-                return 1    
-            fi
+            check_lnet $i || return ${PIPESTATUS[0]}
+
+            checked_hosts="$checked_hosts,${HOST_NAME[i]}"
          fi
  
          # Execute remote command to format or upgrade Lustre target
@@ -1040,7 +595,7 @@ mass_config() {
          $UPGRADE_TARGET && OP="Upgrading" || OP="Formatting"
          verbose_output "$OP Lustre target ${DEVICE_NAME[i]} on ${HOST_NAME[i]}..."
  
-        COMMAND="$EXPORT_PATH $MKFS_CMD"
+        COMMAND="export PATH=\$PATH:/sbin:/usr/sbin; $MKFS_CMD"
          REMOTE_CMD[$pid_num]="$REMOTE ${HOST_NAME[i]} \"$COMMAND\""
          verbose_output "$OP command line is: ${REMOTE_CMD[$pid_num]}"
  
@@ -1056,7 +611,7 @@ mass_config() {
      for ((pid_num = 0; pid_num < ${#REMOTE_PID[@]}; pid_num++)); do
          wait ${REMOTE_PID[${pid_num}]}
          if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "`basename $0`: mass_config() error: Failed"\
+            error_output "mass_config(): Failed"\
              "to execute \"${REMOTE_CMD[${pid_num}]}\"!"
              fail_exit_status=true
          fi
@@ -1122,12 +677,12 @@ modify_fstab() {
          # Get mount options
          if [ -n "${MOUNT_OPTIONS[i]}" ]; then
              # The mount options already specified in the csv file.
-            mntopts=${MOUNT_OPTIONS[i]}
+            mntopts="${MOUNT_OPTIONS[i]}"
          else
              mntopts=$(get_mntopts ${HOST_NAME[i]} ${DEVICE_NAME[i]}\
                      ${FAILOVERS[i]})
              if [ ${PIPESTATUS[0]} -ne 0 ]; then
-                echo >&2 "${mntopts}"
+                error_output "${mntopts}"
                  return 1
              fi
          fi
@@ -1142,7 +697,7 @@ modify_fstab() {
                  echo -e \"${mntent}\" >> \$(fcanon /etc/fstab)"
          ${REMOTE} ${HOST_NAME[i]} "${COMMAND}" >&2
          if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo >&2 "`basename $0`: modify_fstab() error:"\
+            error_output "modify_fstab():"\
              "Failed to modify /etc/fstab of host ${HOST_NAME[i]}"\
              "to add Lustre target ${DEVICE_NAME[i]}!"
              return 1
@@ -1152,18 +707,16 @@ modify_fstab() {
      return 0
  }
  
-# Main flow
+#********************************* Main Flow **********************************#
+
  # Check the csv file
-if ! check_file $1; then
-    exit 1 
-fi
+check_file $CSV_FILE || exit ${PIPESTATUS[0]}
  
  # Get the list of nodes to be operated on
-NODES_TO_USE=$(get_nodelist)
-[ ${PIPESTATUS[0]} -ne 0 ] && echo >&2 "${NODES_TO_USE}" && exit 1
+NODES_TO_USE=$(get_nodelist) || error_exit ${PIPESTATUS[0]} "$NODES_TO_USE"
  
  # Check the node list
-check_nodelist ${NODES_TO_USE} || exit 1
+check_nodelist $NODES_TO_USE || exit ${PIPESTATUS[0]}
  
  if ${VERIFY_CONNECT}; then
  # Check the network connectivity and hostnames
@@ -1192,22 +745,17 @@ if $CONFIG_MD_LVM && ! $UPGRADE_TARGET; then
  fi
  
  # Configure the Lustre cluster
-echo "`basename $0`: ******** Lustre cluster configuration START ********"
-if ! get_items ${CSV_FILE}; then
-    exit 1
-fi
+echo "`basename $0`: ******** Lustre cluster configuration BEGIN ********"
  
-if ! check_mgs; then
-    exit 1
-fi
+get_lustre_items $CSV_FILE || exit ${PIPESTATUS[0]}
  
-if ! mass_config; then
-    exit 1
-fi
+check_mgs || exit ${PIPESTATUS[0]}
  
-if ! modify_fstab; then
-    exit 1
-fi
+# Format or upgrade Lustre server targets
+mass_config || exit ${PIPESTATUS[0]}
+
+# Modify /etc/fstab to add the new Lustre server targets
+modify_fstab || exit ${PIPESTATUS[0]}
  
  # Produce HA software's configuration files
  if ! config_ha; then
diff --git a/lustre/scripts/lustre_createcsv.in b/lustre/scripts/lustre_createcsv.in

index 752f3cd..37d6ecd 100644 (file)
--- a/lustre/scripts/lustre_createcsv.in
+++ b/lustre/scripts/lustre_createcsv.in
@@ -1,4 +1,7 @@
  #!/bin/bash
+
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
+
  #
  # lustre_createcsv - generate a csv file from a running lustre cluster
  #
@@ -13,7 +16,7 @@
  
  # Usage
  usage() {
-       cat >&2 <<EOF
+       cat <<EOF
  
  Usage: `basename $0` [-t HAtype] [-d] [-h] [-v] [-f csv_filename]
  
@@ -33,7 +36,6 @@ Usage:        `basename $0` [-t HAtype] [-d] [-h] [-v] [-f csv_filename]
                         Default is lustre_config.csv.
  
  EOF
-       exit 1
  }
  
  # Get the library of functions
@@ -111,35 +113,36 @@ while getopts "t:dhvf:" OPTION; do
                 if [ "${HATYPE_OPT}" != "${HBVER_HBV1}" ] \
                 && [ "${HATYPE_OPT}" != "${HBVER_HBV2}" ] \
                 && [ "${HATYPE_OPT}" != "${HATYPE_CLUMGR}" ]; then
-                       echo >&2 "`basename $0`: Invalid HA software type" \
+                       error_output "Invalid HA software type" \
                                  "- ${HATYPE_OPT}!"
-                       usage
+                       usage 1>&2
+            exit 1
                 fi
                 ;;
         d)      GET_MDLVM_INFO=true;;
-       h)      usage;;
+       h)      usage && exit 0;;
         v)      VERBOSE_OUTPUT=true;;
         f)      LUSTRE_CSV_FILE=$OPTARG;;
-        ?)     usage 
+    ?)         usage 1>&2 && exit 1;;
         esac
  done
  
  # Verify the local host is the MGS node
  mgs_node() {
         if [ ! -e ${LUSTRE_PROC_DEVICES} ]; then
-               echo >&2 "`basename $0`: error: ${LUSTRE_PROC_DEVICES} does" \
+               error_output "${LUSTRE_PROC_DEVICES} does" \
                          "not exist. Lustre kernel modules may not be loaded!"
                 return 1
         fi
  
         if [ -z "`cat ${LUSTRE_PROC_DEVICES}`" ]; then
-               echo >&2 "`basename $0`: error: ${LUSTRE_PROC_DEVICES} is" \
+               error_output "${LUSTRE_PROC_DEVICES} is" \
                          "empty. Lustre services may not be started!"
                 return 1
         fi
  
         if [ -z "`grep ${MGS_TYPE} ${LUSTRE_PROC_DEVICES}`" ]; then
-               echo >&2 "`basename $0`: error: This node is not a MGS node." \
+               error_output "This node is not a MGS node." \
                           "The script should be run on the MGS node!"
                 return 1
         fi
@@ -159,7 +162,7 @@ get_hostnames() {
         fi
  
         if [ ! -e ${LNET_PROC_PEERS} ]; then
-               echo >&2 "`basename $0`: error: ${LNET_PROC_PEERS} does not" \
+               error_output "${LNET_PROC_PEERS} does not" \
                           "exist. LNET kernel modules may not be loaded" \
                          "or LNET network may not be up!"
                 return 1
@@ -188,14 +191,14 @@ get_hostnames() {
         # Get the hostnames of the nodes
         for ((idx = 1, i = 1; idx < ${#HOST_NIDS[@]}; idx++, i++)); do
                 if [ -z "${HOST_NIDS[idx]}" ]; then
-                       echo >&2 "`basename $0`: get_hostnames() error:" \
+                       error_output "get_hostnames():" \
                                  "Invalid nid - \"${HOST_NIDS[idx]}\"!"
                         return 1
                 fi
  
                 HOST_NAMES[i]=$(nid2hostname ${HOST_NIDS[idx]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${HOST_NAMES[i]}"
+                       error_output "${HOST_NAMES[i]}"
                         return 1
                 fi
  
@@ -247,7 +250,8 @@ get_md_configs() {
                 if [ "${first_item}" != "${first_item#devices=}" ]; then
                         MD_DEVS[j]=`echo "${line}" | sed -e 's/devices=//' -e 's/,/ /g'`
                 fi
-        done < <(${REMOTE} ${host_name} "${MDADM} --detail --scan --verbose")
+        done < <(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin
+        ${MDADM} --detail --scan --verbose")
  
         if [ $i -eq 0 ]; then
                 verbose_output "There are no active MD devices" \
@@ -265,11 +269,12 @@ get_pv_configs() {
         local cmd ret_str
  
         # Execute remote command to get all the PV informations.
-       cmd="${EXPORT_PATH} pvdisplay -c | awk -F: '{print \$1}' | xargs"
+       cmd="PATH=\$PATH:/sbin:/usr/sbin \
+pvdisplay -c | awk -F: '{print \$1}' | xargs"
         ret_str=`${REMOTE} ${host_name} "${cmd}" 2>&1`
         if [ $? -ne 0 ]; then
                 if [ -n "${ret_str}" ]; then
-                       echo >&2 "`basename $0`: get_pv_configs() error:" \
+                       error_output "get_pv_configs():" \
                         "remote command to ${host_name} error: ${ret_str}"
                 else
                         remote_error "get_pv_configs" ${host_name}
@@ -295,7 +300,7 @@ get_vg_pvnames() {
         local cmd ret_str
  
         # Execute remote command to get the PV names.
-       cmd="${EXPORT_PATH} vgdisplay -v ${vg_name} 2>/dev/null\
+       cmd="PATH=\$PATH:/sbin:/usr/sbin vgdisplay -v ${vg_name} 2>/dev/null\
              | grep \"PV Name\" | awk '{print \$3}' | xargs"
         ret_str=`${REMOTE} ${host_name} "${cmd}" 2>&1`
         if [ $? -ne 0 ]; then
@@ -333,12 +338,12 @@ get_vg_configs() {
         unset VG_PVNAMES
  
         # Execute remote command to get all the VG names.
-       cmd="${EXPORT_PATH} vgdisplay \
+       cmd="PATH=\$PATH:/sbin:/usr/sbin vgdisplay \
              | grep \"VG Name\" | awk '{print \$3}' | xargs"
         ret_str=`${REMOTE} ${host_name} "${cmd}" 2>&1`
         if [ $? -ne 0 ]; then
                 if [ -n "${ret_str}" ]; then
-                       echo >&2 "`basename $0`: get_vg_configs() error:" \
+                       error_output "get_vg_configs():" \
                         "remote command to ${host_name} error: ${ret_str}"
                 else
                         remote_error "get_vg_configs" ${host_name}
@@ -357,7 +362,7 @@ get_vg_configs() {
                 VG_NAME[i]=${vg_name}
                 VG_PVNAMES[i]=$(get_vg_pvnames ${host_name} ${VG_NAME[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${VG_PVNAMES[i]}"
+                       error_output "${VG_PVNAMES[i]}"
                         return 1
                 fi
                 let "i += 1"
@@ -395,7 +400,7 @@ get_lv_configs() {
                 LV_SIZE[i]=`echo "${line}" | awk -F: '{print $7}' | sed -e 's/.*/&K/'`
  
                 let "i += 1"
-        done < <(${REMOTE} ${host_name} "${EXPORT_PATH} lvdisplay -c")
+        done < <(${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin lvdisplay -c")
  
         if [ $i -eq 0 ]; then
                 verbose_output "There are no LVs in the host ${host_name}"
@@ -439,7 +444,7 @@ get_module_opts() {
         # Execute remote command to get the kernel version
         ret_str=`${REMOTE} ${host_name} "uname -r" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
-               echo >&2 "`basename $0`: get_module_opts() error:" \
+               error_output "get_module_opts():" \
                          "remote command error: ${ret_str}"
                 return 1
         fi
@@ -519,7 +524,7 @@ is_ha_target() {
         ret_str=`${REMOTE} ${host_name} \
                 "grep ${target_svname} ${res_file}" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
-               echo >&2 "`basename $0`: is_ha_target() error:" \
+               error_output "is_ha_target():" \
                          "remote command error: ${ret_str}"
                 return 1
         fi
@@ -577,7 +582,7 @@ get_hb_configs() {
          done < <(${REMOTE} ${host_name} "cat ${HA_CF}")
  
         if [ -z "${HB_CHANNELS}" ]; then
-               echo >&2 "`basename $0`: get_hb_configs() error:" \
+               error_output "get_hb_configs():" \
                          "There are no heartbeat channel configs in ${HA_CF}" \
                          "of host ${host_name} or ${HA_CF} does not exist!"
                 return 0
@@ -607,7 +612,7 @@ get_hb_configs() {
                 done < <(${REMOTE} ${host_name} "cat ${HA_RES}")
         
                 if [ -z "${SRV_IPADDRS}" ]; then
-                       echo >&2 "`basename $0`: get_hb_configs() error: There"\
+                       error_output "get_hb_configs(): There"\
                                  "are no service address in ${HA_RES} of host"\
                                  "${host_name} or ${HA_RES} does not exist!"
                         return 0
@@ -726,14 +731,14 @@ get_cluman_configs() {
                 # Execute remote command to get Heartbeat channel
                 HB_CHANNELS=$(get_cluman_channel ${host_name})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${HB_CHANNELS}"
+                       error_output "${HB_CHANNELS}"
                 fi
  
                 # Execute remote command to get service IP address 
                 SRV_IPADDRS=$(get_cluman_srvaddr ${host_name} \
                               ${TARGET_SVNAMES[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${SRV_IPADDRS}"
+                       error_output "${SRV_IPADDRS}"
                         return 0
                 fi
  
@@ -828,7 +833,7 @@ get_svnames(){
                         let "i += 1"
                         let "j += 1"
                 else
-                       echo >&2 "`basename $0`: get_svnames() error: Invalid"\
+                       error_output "get_svnames(): Invalid"\
                               "line in ${host_name}'s ${LUSTRE_PROC_DEVICES}"\
                               "- \"${line}\"!"
                         return 1
@@ -869,7 +874,7 @@ get_devname() {
         if [ "${target_svname}" = "${MGS_SVNAME}" ]; then
                 # Execute remote command to get the device name of mgs target
                 ret_str=`${REMOTE} ${host_name} \
-                       "/sbin/findfs LABEL=${target_svname}" 2>&1`
+                       "PATH=\$PATH:/sbin:/usr/sbin findfs LABEL=${target_svname}" 2>&1`
                 if [ $? -ne 0 -a -n "${ret_str}" ]; then
                         if [ "${ret_str}" = "${ret_str#*Unable to resolve*}" ]
                         then
@@ -926,7 +931,7 @@ get_devsize() {
  
         # Execute remote command to get the device size
         ret_str=`${REMOTE} ${host_name} \
-               "/sbin/blockdev --getsize ${target_devname}" 2>&1`
+               "PATH=\$PATH:/sbin:/usr/sbin blockdev --getsize ${target_devname}" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: get_devsize() error:" \
                      "remote command error: ${ret_str}"
@@ -962,7 +967,7 @@ get_realdevname() {
  
         # Execute remote command to get the real device name
         ret_str=`${REMOTE} ${host_name} \
-               "/sbin/losetup ${loop_dev}" 2>&1`
+               "PATH=\$PATH:/sbin:/usr/sbin losetup ${loop_dev}" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: get_realdevname() error:" \
                      "remote command error: ${ret_str}"
@@ -1038,7 +1043,7 @@ get_devnames(){
                 TARGET_DEVNAMES[i]=$(get_devname ${host_name} \
                                      ${TARGET_SVNAMES[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${TARGET_DEVNAMES[i]}"
+                       error_output "${TARGET_DEVNAMES[i]}"
                         return 1
                 fi
  
@@ -1048,7 +1053,7 @@ get_devnames(){
                                                "target in ${host_name}."
                                 continue
                         else
-                               echo >&2 "`basename $0`: get_devname() error:"\
+                               error_output "get_devname():"\
                                          "No device corresponding to target" \
                                          "${TARGET_SVNAMES[i]} in ${host_name}!"
                                 return 1
@@ -1059,7 +1064,7 @@ get_devnames(){
                 TARGET_MNTPNTS[i]=$(get_mntpnt ${host_name} \
                                      ${TARGET_DEVNAMES[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${TARGET_MNTPNTS[i]}"
+                       error_output "${TARGET_MNTPNTS[i]}"
                         return 1
                 fi
  
@@ -1070,7 +1075,7 @@ get_devnames(){
                         TARGET_DEVSIZES[i]=$(get_devsize ${host_name} \
                                              ${TARGET_DEVNAMES[i]})
                         if [ $? -ne 0 ]; then
-                               echo >&2 "${TARGET_DEVSIZES[i]}"
+                               error_output "${TARGET_DEVSIZES[i]}"
                                 return 1
                         fi
  
@@ -1078,7 +1083,7 @@ get_devnames(){
                         TARGET_DEVNAMES[i]=$(get_realdevname ${host_name} \
                                              ${TARGET_DEVNAMES[i]})
                         if [ $? -ne 0 ]; then
-                               echo >&2 "${TARGET_DEVNAMES[i]}"
+                               error_output "${TARGET_DEVNAMES[i]}"
                                 return 1
                         fi
                 fi
@@ -1095,7 +1100,7 @@ is_target() {
         "ost") let "ret = $2 & LDD_F_SV_TYPE_OST";;
         "mgs") let "ret = $2 & LDD_F_SV_TYPE_MGS";;
         "*") 
-               echo >&2 "`basename $0`: is_target() error: Invalid" \
+               error_output "is_target(): Invalid" \
                 "target service type - \"$1\"!"
                 return 1
                 ;;
@@ -1262,8 +1267,8 @@ get_stripecount() {
                 stripe_count=`echo ${ret_str} | awk '{print $1}'`
         fi
  
-       if [ -z "`echo ${stripe_count}|awk '/^[[:digit:]]/ {print $0}'`" ]
-       then
+    if [ "$stripe_count" != "-1" ] && \
+    [ -z "`echo ${stripe_count}|awk '/^[[:digit:]]/ {print $0}'`" ]; then
                 echo "`basename $0`: get_stripecount() error: can't" \
                 "get stripe count of ${target_fsname} in ${host_name}!"
                 return 1
@@ -1359,7 +1364,7 @@ get_ldds(){
                 ${TUNEFS} --print --verbose ${TARGET_DEVNAMES[i]} 2>/dev/null")
  
                 if [ -z "${flags}" ]; then
-                       echo >&2 "`basename $0`: get_ldds() error: Invalid" \
+                       error_output "get_ldds(): Invalid" \
                                  "ldd_flags of target ${TARGET_DEVNAMES[i]}" \
                                  "in host ${host_name} - it's value is null!"\
                                  "Check ${TUNEFS} command!"
@@ -1376,7 +1381,7 @@ get_ldds(){
                 # Get the lustre target service type
                 TARGET_DEVTYPES[i]=$(get_devtype ${flags})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${TARGET_DEVTYPES[i]} From device" \
+                       error_output "${TARGET_DEVTYPES[i]} From device" \
                         "${TARGET_DEVNAMES[i]} in host ${host_name}!"
                         return 1
                 fi
@@ -1390,7 +1395,7 @@ get_ldds(){
                 # Get failover nids of the lustre target
                 TARGET_FAILNIDS[i]=$(get_failnids "${params}")
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${TARGET_FAILNIDS[i]} From device" \
+                       error_output "${TARGET_FAILNIDS[i]} From device" \
                         "${TARGET_DEVNAMES[i]} in host ${host_name}!"
                         return 1
                 fi
@@ -1398,7 +1403,7 @@ get_ldds(){
                 # Get other format options of the lustre target
                 TARGET_FMTOPTS[i]=$(get_fmtopts ${TARGET_DEVNAMES[i]} ${host_name} "${params}")
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${TARGET_FMTOPTS[i]}"
+                       error_output "${TARGET_FMTOPTS[i]}"
                         return 1
                 fi
  
@@ -1422,7 +1427,7 @@ get_ldds(){
                         # Get the stripe count option
                         stripecount_opt=$(get_stripecount_opt ${host_name} ${TARGET_FSNAMES[i]})
                         if [ $? -ne 0 ]; then
-                               echo >&2 "${stripecount_opt}"
+                               error_output "${stripecount_opt}"
                                 return 1
                         fi
  
@@ -1453,8 +1458,8 @@ get_journalsize() {
         local ret_str
  
         # Execute remote command to get the journal inode number
-       ret_str=`${REMOTE} ${host_name} "/sbin/debugfs -R 'stats -h' \
-                ${target_devname} | grep 'Journal inode:'" 2>&1`
+       ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \
+debugfs -R 'stats -h' ${target_devname} | grep 'Journal inode:'" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: get_journalsize() error:" \
                      "remote command error: ${ret_str}"
@@ -1471,8 +1476,8 @@ get_journalsize() {
         fi
  
         # Execute remote command to get the journal size
-       ret_str=`${REMOTE} ${host_name} "/sbin/debugfs -R \
-               'stat <${journal_inode}>' ${target_devname}|grep '^User:'" 2>&1`
+       ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \
+debugfs -R 'stat <${journal_inode}>' ${target_devname}|grep '^User:'" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: get_journalsize() error:" \
                      "remote command error: ${ret_str}"
@@ -1531,8 +1536,8 @@ figure_journal_size() {
         declare -i journal_size
  
         # Execute remote command to get the block count 
-       ret_str=`${REMOTE} ${host_name} "/sbin/debugfs -R 'stats -h' \
-                ${target_devname} | grep 'Block count:'" 2>&1`
+       ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \
+debugfs -R 'stats -h' ${target_devname} | grep 'Block count:'" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: figure_journal_size() error:" \
                      "remote command error: ${ret_str}"
@@ -1614,8 +1619,8 @@ get_ratio() {
         local ret_str
  
         # Execute remote command to get the inode count
-       ret_str=`${REMOTE} ${host_name} "/sbin/debugfs -R 'stats -h' \
-                ${target_devname} | grep 'Inode count:'" 2>&1`
+       ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \
+debugfs -R 'stats -h' ${target_devname} | grep 'Inode count:'" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: get_ratio() error:" \
                      "remote command error: ${ret_str}"
@@ -1632,8 +1637,8 @@ get_ratio() {
         fi
  
         # Execute remote command to get the block count
-       ret_str=`${REMOTE} ${host_name} "/sbin/debugfs -R 'stats -h' \
-                ${target_devname} | grep 'Block count:'" 2>&1`
+       ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \
+debugfs -R 'stats -h' ${target_devname} | grep 'Block count:'" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: get_ratio() error:" \
                      "remote command error: ${ret_str}"
@@ -1714,8 +1719,8 @@ get_isize() {
         local ret_str
  
         # Execute remote command to get the inode size 
-       ret_str=`${REMOTE} ${host_name} "/sbin/debugfs -R 'stats -h' \
-                ${target_devname} | grep 'Inode size:'" 2>&1`
+       ret_str=`${REMOTE} ${host_name} "PATH=\$PATH:/sbin:/usr/sbin \
+debugfs -R 'stats -h' ${target_devname} | grep 'Inode size:'" 2>&1`
         if [ $? -ne 0 -a -n "${ret_str}" ]; then
                 echo "`basename $0`: get_isize() error:" \
                      "remote command error: ${ret_str}"
@@ -1851,7 +1856,7 @@ get_mkfsopts(){
                         TARGET_DEVSIZES[i]=$(get_devsize ${host_name} \
                                          ${TARGET_DEVNAMES[i]})
                         if [ $? -ne 0 ]; then
-                               echo >&2 "${TARGET_DEVSIZES[i]}"
+                               error_output "${TARGET_DEVSIZES[i]}"
                                 return 1
                         fi
                 fi
@@ -1860,7 +1865,7 @@ get_mkfsopts(){
                 journal_opt=$(get_J_opt ${host_name} ${TARGET_DEVNAMES[i]} \
                               ${TARGET_DEVSIZES[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${journal_opt}"
+                       error_output "${journal_opt}"
                         return 1
                 fi
  
@@ -1876,7 +1881,7 @@ get_mkfsopts(){
                 ratio_opt=$(get_i_opt ${host_name} ${TARGET_DEVNAMES[i]} \
                             ${TARGET_DEVTYPES[i]} ${TARGET_DEVSIZES[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${ratio_opt}"
+                       error_output "${ratio_opt}"
                         return 1
                 fi
  
@@ -1892,7 +1897,7 @@ get_mkfsopts(){
                 inode_size_opt=$(get_I_opt ${host_name} ${TARGET_DEVNAMES[i]} \
                                  ${TARGET_DEVTYPES[i]} ${TARGET_FSNAMES[i]})
                 if [ $? -ne 0 ]; then
-                       echo >&2 "${inode_size_opt}"
+                       error_output "${inode_size_opt}"
                         return 1
                 fi
  
@@ -1956,7 +1961,7 @@ get_target_configs() {
  get_configs() {
         # Check the hostname
         if [ -z "$1" ]; then
-               echo >&2 "`basename $0`: get_configs() error:" \
+               error_output "get_configs():" \
                          "Missing hostname!"
                 return 1
         fi
diff --git a/lustre/scripts/lustre_req_history b/lustre/scripts/lustre_req_history

index de28076..7902bc6 100644 (file)
--- a/lustre/scripts/lustre_req_history
+++ b/lustre/scripts/lustre_req_history
@@ -1,5 +1,5 @@
  #!/bin/sh
-# Copyright (C) 2006  Cluster File Systems, Inc.
+# Copyright 2008 Sun Microsystems, Inc.
  
  # Purpose:
  # This script displays the history of requests from the local client  
diff --git a/lustre/scripts/lustre_start.in b/lustre/scripts/lustre_start.in

new file mode 100644 (file)

index 0000000..7493bb8
--- /dev/null
+++ b/lustre/scripts/lustre_start.in
@@ -0,0 +1,372 @@
+#!/bin/bash
+
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
+
+#
+# lustre_start - start or stop multiple Lustre servers from a CSV file
+#
+# This script is used to parse each line of a CSV (Comma-Separated Value) file
+# and execute remote command to start/stop the service on every Lustre server
+# target that will be part of the Lustre cluster.
+#
+################################################################################
+
+# Usage
+usage() {
+    cat <<EOF
+
+Usage: $(basename $0) [options] <-a|-w|-x> <CSV file>
+
+    This script is used to start or stop multiple Lustre servers from a
+    CSV file.
+
+    Options:
+    -a          select all the nodes from the CSV file to operate on
+    -w hostname,hostname,...
+                select the specified list of nodes (separated by commas) to
+                operate on rather than all the nodes in the CSV file
+    -x hostname,hostname,...
+                exclude the specified list of nodes (separated by commas)
+    -n          no net - don't verify network connectivity and hostnames
+                in the cluster
+    -m          pass "mount options" item in the CSV file to mount command line
+    -k          stop the services on Lustre server targets
+    -v          verbose mode
+    -h          help
+    CSV file    a comma-separated value file that contains configuration
+                parameters for each target in a Lustre cluster
+
+    Please refer to "lustre_config -h" for the description of CSV file formats.
+
+EOF
+}
+
+# Get the library of functions
+. @scriptlibdir@/lc_common
+
+SPECIFY_MNTOPTS=false
+STOP_SERVICE=false
+# Get and check the positional parameters
+while getopts "aw:x:nmkhv" OPTION; do
+    case $OPTION in
+    a)
+        [ -z "$SPECIFIED_NODELIST" ] && [ -z "$EXCLUDED_NODELIST" ] \
+        && USE_ALLNODES=true
+        NODELIST_OPT="$NODELIST_OPT -a"
+        ;;
+    w)
+        USE_ALLNODES=false
+        SPECIFIED_NODELIST=$OPTARG
+        NODELIST_OPT="$NODELIST_OPT -w $SPECIFIED_NODELIST"
+        ;;
+    x)
+        USE_ALLNODES=false
+        EXCLUDED_NODELIST=$OPTARG
+        NODELIST_OPT="$NODELIST_OPT -x $EXCLUDED_NODELIST"
+        ;;
+    n)
+        VERIFY_CONNECT=false
+        ;;
+    m)
+        SPECIFY_MNTOPTS=true
+        ;;
+    k)
+        STOP_SERVICE=true 
+        ;;
+    h)
+        usage
+        exit 0
+        ;;
+    v)
+        VERBOSE_OPT="-v"
+        VERBOSE_OUTPUT=true
+        ;;
+    ?)
+        usage 1>&2
+        exit 1
+        ;;
+    esac
+done
+
+# Toss out the parameters we've already processed
+shift $((OPTIND - 1))
+
+# Here we expect the CSV file
+if [ $# -eq 0 ]; then
+    error_output "Missing CSV file!"
+    usage 1>&2
+    exit 1
+fi
+
+CSV_FILE=$1
+
+# get_fstab_mntopts host_name device_name mount_point
+# Get the mount options from the /etc/fstab file
+get_fstab_mntopts() {
+    local host_name=$1
+    local dev_name=$2
+    local mnt_pnt=$3
+
+    local mnt_opts=""
+
+    if [ -z "$host_name" -o -z "$dev_name" -o -z "$mnt_pnt" ]; then
+        echo "get_fstab_mntopts(): Missing argument!"
+        return 1
+    fi
+
+    # Execute remote command to get the mount options from /etc/fstab file
+    mnt_opts=$($REMOTE $host_name "grep -w ^$dev_name /etc/fstab | \
+grep -w $mnt_pnt | awk '{print \$4}'" 2>/dev/null)
+
+    mnt_opts=${mnt_opts//$host_name: /}
+
+    echo $mnt_opts
+    return 0
+}
+
+# Start the service on one Lustre server target
+start_service() {
+    declare -i i=$1
+    shift
+    local extra_mntopts="$*"
+    local mntopts=""
+
+    # Get mount options
+    if $SPECIFY_MNTOPTS; then
+        # Use the "mount options" item from the CSV file
+        [ -n "${MOUNT_OPTIONS[i]}" ] && mntopts=${MOUNT_OPTIONS[i]}
+    else
+        # Do not use the "mount options" item from the CSV file
+        mntopts=$(get_fstab_mntopts ${HOST_NAME[i]} ${DEVICE_NAME[i]} \
+                ${MOUNT_POINT[i]})
+        [ ${PIPESTATUS[0]} -ne 0 ] && error_output "$mntopts" && return 1
+    fi
+
+    [ -n "$mntopts" ] && mntopts="-o $mntopts"
+    [ -n "$extra_mntopts" ] && mntopts="$mntopts $extra_mntopts"
+    # Strip of any leading space
+    mntopts=${mntopts# }
+
+    # Execute remote command to start the service
+    verbose_output "Mounting Lustre ${DEVICE_TYPE[i]} target"\
+    "${DEVICE_NAME[i]} (opts: $mntopts) on ${HOST_NAME[i]}:${MOUNT_POINT[i]}..."
+    $REMOTE ${HOST_NAME[i]} "PATH=\$PATH:/sbin:/usr/sbin
+error() { set +x; echo \"ERROR: \$2: \$1\"; echo \"XXRETCODE:\$1\"; exit \$1; }
+mkdir -p ${MOUNT_POINT[i]} || \\
+    error \${PIPESTATUS[0]} \"failed to mkdir ${MOUNT_POINT[i]}\"
+mount -t $FS_TYPE $mntopts ${DEVICE_NAME[i]} ${MOUNT_POINT[i]} || \\
+    error \${PIPESTATUS[0]} \\
+        \"failed to mount ${DEVICE_NAME[i]} on host ${HOST_NAME[i]}\""
+    return ${PIPESTATUS[0]}
+}
+
+# Stop the service on one Lustre server target
+stop_service() {
+    declare -i i=$1
+
+    # Execute remote command to stop the service
+    verbose_output "Unmounting Lustre ${DEVICE_TYPE[i]} target"\
+    "${DEVICE_NAME[i]} on ${HOST_NAME[i]}:${MOUNT_POINT[i]}..."
+    $REMOTE ${HOST_NAME[i]} "PATH=\$PATH:/sbin:/usr/sbin
+error() { set +x; echo \"ERROR: \$2: \$1\"; echo \"XXRETCODE:\$1\"; exit \$1; }
+if grep -q \" ${MOUNT_POINT[i]} \" /proc/mounts; then
+    umount -d -f ${MOUNT_POINT[i]} || \\
+        error \${PIPESTATUS[0]} \\
+            \"failed to unmount ${DEVICE_NAME[i]} on host ${HOST_NAME[i]}\"
+else
+    echo \"${DEVICE_NAME[i]} was not mounted on\"\\
+    \"${HOST_NAME[i]}:${MOUNT_POINT[i]}\"
+fi"
+    return ${PIPESTATUS[0]}
+}
+
+# mass_op op_type target_type
+# Start/stop the services on Lustre server targets in parallel
+mass_op() {
+    local op_type=$1
+    local target_type=$2
+
+    local op_func
+    declare -i i
+    declare -i pid_num=0
+    declare -a REMOTE_PID
+    local RC=0
+
+    if [ -z "$op_type" -o -z "$target_type" ]; then
+        error_output "mass_op(): Missing argument!"
+        return 1
+    fi
+
+    case "$op_type" in
+    "start")  op_func=start_service;;
+    "stop")   op_func=stop_service;;
+    *)  error_output "mass_op(): Invalid op type \"$op_type\"!" && return 1;;
+    esac
+
+    for ((i = 0; i < ${#HOST_NAME[@]}; i++)); do
+        if [ "${DEVICE_TYPE[i]}" = "$target_type" ] \
+        && [[ "${FORMAT_OPTIONS[i]}" != *noformat* ]]; then
+            eval "$op_func $i &"
+            REMOTE_PID[$pid_num]=$!
+            let pid_num=$pid_num+1
+        fi
+    done
+
+    for ((pid_num = 0; pid_num < ${#REMOTE_PID[@]}; pid_num++)); do
+        wait ${REMOTE_PID[${pid_num}]}
+        local RC1=${PIPESTATUS[0]}
+        [ $RC1 -ne 0 ] && RC=$RC1
+    done
+
+    [ $RC -ne 0 ] && return $RC
+
+    return 0
+}
+
+# Unload the modules to make cleanup
+unload_modules() {
+    local command
+    local host
+    local host_list
+
+    host_list=$(comma_list "${HOST_NAME[@]}")
+    [ -z "$host_list" ] && return 0
+
+    command="PATH=\$PATH:/sbin:/usr/sbin
+if grep -q libcfs /proc/modules; then
+    lctl net down 1>/dev/null 2>&1
+    lustre_rmmod
+fi"
+
+    if is_pdsh; then
+        $REMOTE $host_list "$command"
+    else
+        for host in ${host_list//,/ }; do
+            $REMOTE $host "$command"
+        done
+    fi
+}
+
+# Start the services on Lustre server targets
+mass_start() {
+    declare -i i
+    local combo_mgs_mdt=false
+
+    if [ ${#HOST_NAME[@]} -eq 0 ]; then
+        verbose_output "There are no Lustre targets specified."
+        return 0
+    fi
+
+    # Start lnet network on the MGS node
+    start_mgs_lnet || return ${PIPESTATUS[0]}
+
+    local checked_hosts=""
+    for ((i = 0; i < ${#HOST_NAME[@]}; i++)); do
+        host_in_hostlist ${HOST_NAME[i]} $checked_hosts && continue
+        if ! is_mgs_node ${HOST_NAME[i]}; then
+            # Add module options to the module configuration file
+            add_module_options $i ${HOST_NAME[i]} || return ${PIPESTATUS[0]}
+
+            # Check lnet networks
+            check_lnet $i || return ${PIPESTATUS[0]}
+
+            checked_hosts="$checked_hosts,${HOST_NAME[i]}"
+        fi
+    done
+
+    # Start MGS or the MGS service on combo MGS/MDT (with "-o nosvc -n" options)
+    if [ -n "${MGS_NODENAME[0]}" ]; then
+        local idx=${MGS_IDX[0]}
+        if [ "${DEVICE_TYPE[idx]#*mdt*}" != "${DEVICE_TYPE[idx]}" ]; then
+            # Combo MGS/MDT
+            combo_mgs_mdt=true
+            start_service ${MGS_IDX[0]} "-o nosvc -n" || return ${PIPESTATUS[0]}
+        else
+            start_service ${MGS_IDX[0]} || return ${PIPESTATUS[0]}
+        fi
+    fi
+
+    # Start OST(s)
+    mass_op "start" "ost" || return ${PIPESTATUS[0]}
+
+    # Start the MDT service on combo MGS/MDT (with "-o nomgs" option)
+    if $combo_mgs_mdt; then
+        start_service ${MGS_IDX[0]} "-o nomgs" || return ${PIPESTATUS[0]}
+    fi
+
+    # Start MDT(s)
+    mass_op "start" "mdt" || return ${PIPESTATUS[0]}
+
+    verbose_output "Success on all Lustre targets!"
+    return 0
+}
+
+# Stop the services on Lustre server targets
+mass_stop() {
+    declare -i i
+
+    if [ ${#HOST_NAME[@]} -eq 0 ]; then
+        verbose_output "There are no Lustre targets specified."
+        return 0
+    fi
+
+    # Stop MDT(s)
+    mass_op "stop" "mdt" || return ${PIPESTATUS[0]}
+
+    # Stop the MDT service on combo MGS/MDT
+    if [ -n "${MGS_NODENAME[0]}" ]; then
+        local idx=${MGS_IDX[0]}
+        if [ "${DEVICE_TYPE[idx]#*mdt*}" != "${DEVICE_TYPE[idx]}" ]; then
+            # Combo MGS/MDT
+            stop_service ${MGS_IDX[0]} || return ${PIPESTATUS[0]}
+        fi
+    fi
+
+    # Stop OST(s)
+    mass_op "stop" "ost" || return ${PIPESTATUS[0]}
+    
+    # Stop MGS or the MGS service on combo MGS/MDT
+    if [ -n "${MGS_NODENAME[0]}" ]; then
+        stop_service ${MGS_IDX[0]} || return ${PIPESTATUS[0]}
+    fi
+
+    unload_modules
+
+    return 0
+}
+
+#********************************* Main Flow **********************************#
+
+# Check the CSV file
+check_file $CSV_FILE || exit ${PIPESTATUS[0]}
+
+# Get the list of nodes to be operated on
+NODES_TO_USE=$(get_nodelist) || error_exit ${PIPESTATUS[0]} "$NODES_TO_USE"
+
+# Check the node list
+check_nodelist $NODES_TO_USE || exit ${PIPESTATUS[0]}
+
+# Check the network connectivity and hostnames
+if $VERIFY_CONNECT; then
+    verbose_output "Checking the cluster network connectivity and hostnames..."
+    $VERIFY_CLUSTER_NET $NODELIST_OPT $VERBOSE_OPT $CSV_FILE || \
+        exit ${PIPESTATUS[0]}
+    verbose_output "Check the cluster network connectivity and hostnames OK!"
+fi
+
+# Configure the Lustre cluster
+echo "$(basename $0): ******** Lustre cluster configuration BEGIN ********"
+
+get_lustre_items $CSV_FILE || exit ${PIPESTATUS[0]}
+
+check_mgs || exit ${PIPESTATUS[0]}
+
+if ! $STOP_SERVICE; then
+    mass_start || exit ${PIPESTATUS[0]}
+else
+    mass_stop || exit ${PIPESTATUS[0]}
+fi
+
+echo "$(basename $0): ******** Lustre cluster configuration END **********"
+
+exit 0
diff --git a/lustre/scripts/maketags.sh b/lustre/scripts/maketags.sh

index 9bd9f87..6f87d8d 100755 (executable)
--- a/lustre/scripts/maketags.sh
+++ b/lustre/scripts/maketags.sh
@@ -1,5 +1,5 @@
  #!/bin/sh
-# Copyright (C) 2001  Cluster File Systems, Inc.
+# Copyright 2008 Sun Microsystems, Inc.
  #
  # This code is issued under the GNU General Public License.
  # See the file COPYING in this distribution
diff --git a/lustre/scripts/nodelustre b/lustre/scripts/nodelustre

index b5e6540..91f47c4 100755 (executable)
--- a/lustre/scripts/nodelustre
+++ b/lustre/scripts/nodelustre
@@ -1,6 +1,6 @@
  #! /bin/sh
  # nodelustre - Start and stop Lustre on MCR nodes
-# Copyright (C) 2002  Cluster File Systems, Inc.
+# Copyright 2008 Sun Microsystems, Inc.
  # Gord Eagle <gord@clusterfs.com>, 2002-09-10
  
  # Set this to the shared config file.
diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore

index f8e2b2d..14cc38d 100644 (file)
--- a/lustre/tests/.cvsignore
+++ b/lustre/tests/.cvsignore
@@ -71,7 +71,7 @@ rmdirmany
  flock_test
  flocks_test
  writemany
-random-reads
+reads
  chownmany
  llverdev
  llverfs
diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am

index f82d16e..c4c393b 100644 (file)
--- a/lustre/tests/Makefile.am
+++ b/lustre/tests/Makefile.am
@@ -11,9 +11,18 @@ noinst_SCRIPTS += sanity.sh rundbench acceptance-small.sh compile.sh
  noinst_SCRIPTS += conf-sanity.sh insanity.sh lfscktest.sh oos.sh oos2.sh
  noinst_SCRIPTS += llog-test.sh recovery-small.sh replay-dual.sh sanity-quota.sh
  noinst_SCRIPTS += replay-ost-single.sh replay-single.sh run-llog.sh sanityN.sh
+noinst_SCRIPTS += large-scale.sh runracer
+noinst_SCRIPTS += performance-sanity.sh mdsrate-create-small.sh
+noinst_SCRIPTS += mdsrate-create-large.sh mdsrate-lookup-1dir.sh
+noinst_SCRIPTS += mdsrate-stat-small.sh mdsrate-stat-large.sh replay-vbr.sh
  noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause
+noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh
+noinst_SCRIPTS += run_dbench.sh
  nobase_noinst_SCRIPTS = cfg/local.sh
  nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
+nobase_noinst_SCRIPTS += racer/dir_create.sh racer/file_create.sh racer/file_list.sh
+nobase_noinst_SCRIPTS += racer/file_rm.sh racer/racer.sh racer/file_concat.sh
+nobase_noinst_SCRIPTS += racer/file_link.sh racer/file_rename.sh racer/file_symlink.sh
  nobase_noinst_DATA = acl/cp.test acl/getfacl-noacl.test acl/inheritance.test
  nobase_noinst_DATA += acl/misc.test acl/permissions.test acl/setfacl.test
  
@@ -25,14 +34,14 @@ noinst_PROGRAMS = openunlink truncate directio openme writeme mlink utime it_tes
  noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy
  noinst_PROGRAMS += createmany chownmany statmany multifstat createtest
  noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test checkstat
-noinst_PROGRAMS += wantedi statone runas openfile getdents o_directory rmdirmany
+noinst_PROGRAMS += statone runas openfile getdents o_directory rmdirmany
  noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod
  noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify mkdirmany
  noinst_PROGRAMS += openfilleddirunlink rename_many memhog iopentest1 iopentest2
  noinst_PROGRAMS += mmap_sanity flock_test writemany reads flocks_test
  noinst_PROGRAMS += ll_getstripe_info
  if MPITESTS
-noinst_PROGRAMS += parallel_grouplock write_append_truncate createmany_mpi
+noinst_PROGRAMS += parallel_grouplock write_append_truncate createmany_mpi mdsrate
  endif
  # noinst_PROGRAMS += ldaptest copy_attr mkdirdeep 
  bin_PROGRAMS = mcreate munlink
@@ -53,14 +62,20 @@ multiop_LDADD=$(LIBLUSTREAPI)
  ll_dirstripe_verify_SOURCES= ll_dirstripe_verify.c
  ll_dirstripe_verify_LDADD= -L$(top_builddir)/lustre/utils -llustreapi
  
+flocks_test_SOURCES=flocks_test.c
+flocks_test_LDADD=-lpthread
+
  if MPITESTS
-LAM_LD_FLAGS=-L/opt/lam/lib -lmpi -llam -lpthread
+#LAM_LD_FLAGS=-L/opt/lam/lib -lmpi -llam -lpthread
+LAM_LD_FLAGS=-lmpich -lpthread
  write_append_truncate_SOURCES=write_append_truncate.c
  write_append_truncate_LDADD=$(LAM_LD_FLAGS)
  createmany_mpi_SOURCES=createmany-mpi.c
  createmany_mpi_LDADD=$(LAM_LD_FLAGS)
-parallel_grouplock_SOURCES=parallel_grouplock.c lp_utils.c
+parallel_grouplock_SOURCES=parallel_grouplock.c lp_utils.c lp_utils.h
  parallel_grouplock_LDADD=$(LAM_LD_FLAGS)
+mdsrate_SOURCES=mdsrate.c
+mdsrate_LDADD=$(LAM_LD_FLAGS) -L$(top_builddir)/lustre/utils -llustreapi
  endif
  
  #copy_attr_LDADD= -lattr
diff --git a/lustre/tests/acceptance-metadata-double.sh b/lustre/tests/acceptance-metadata-double.sh

index 9c9df63..6027db7 100644 (file)
--- a/lustre/tests/acceptance-metadata-double.sh
+++ b/lustre/tests/acceptance-metadata-double.sh
@@ -26,12 +26,12 @@ display_elapsed_time() {
  
  debug_client_on()
  {
-       echo -1 > /proc/sys/lnet/debug
+       lctl set_param -n debug=-1
  }
  
  debug_client_off()
  {
-       echo 0x3f0400 > /proc/sys/lnet/debug
+       lctl set_param -n debug=0x3f0400
  }
  
  MNT=${MNT:-/mnt/lustre}
diff --git a/lustre/tests/acceptance-metadata-single.sh b/lustre/tests/acceptance-metadata-single.sh

index ad927fa..685f9f4 100644 (file)
--- a/lustre/tests/acceptance-metadata-single.sh
+++ b/lustre/tests/acceptance-metadata-single.sh
@@ -26,12 +26,12 @@ display_elapsed_time() {
  
  debug_client_on()
  {
-       echo -1 > /proc/sys/lnet/debug
+       lctl set_param -n debug=-1
  }
  
  debug_client_off()
  {
-       echo 0x3f0400 > /proc/sys/lnet/debug
+       lctl set_param -n debug=0x3f0400
  }
  
  MNT=${MNT:-/mnt/lustre}
diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh

index ea59da9..ef8c61d 100755 (executable)
--- a/lustre/tests/acceptance-small.sh
+++ b/lustre/tests/acceptance-small.sh
@@ -23,13 +23,14 @@ fi
  [ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\""
  [ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484"
  
-export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA"
+export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL REPLAY_VBR INSANITY SANITY_QUOTA PERFORMANCE_SANITY LARGE_SCALE RECOVERY_MDS_SCALE"
  
  if [ "$ACC_SM_ONLY" ]; then
      for O in $TESTSUITE_LIST; do
         export ${O}="no"
      done
      for O in $ACC_SM_ONLY; do
+       O=`echo ${O%.sh} | tr "-" "_"`
         O=`echo $O | tr "[:lower:]" "[:upper:]"`
         export ${O}="yes"
      done
@@ -49,15 +50,51 @@ FORMAT=${FORMAT:-formatall}
  CLEANUP=${CLEANUP:-stopall}
  
  setup_if_needed() {
-    mount | grep $MOUNT && return
-    $FORMAT && $SETUP
+    local MOUNTED=$(mounted_lustre_filesystems)
+    if $(echo $MOUNTED | grep -w -q $MOUNT); then
+        check_config $MOUNT
+        init_versions_vars
+        return
+    fi
+
+    echo "Lustre is not mounted, trying to do setup SETUP=$SETUP ... "
+    [ "$REFORMAT" ] && $FORMAT
+    $SETUP
+
+    MOUNTED=$(mounted_lustre_filesystems)
+    if ! $(echo $MOUNTED | grep -w -q $MOUNT); then
+        echo "Lustre is not mounted after setup! SETUP=$SETUP"
+        exit 1
+    fi
  }
  
  title() {
+    # update titlebar if stdin is attaached to an xterm
+    if ${UPDATE_TITLEBAR:-false}; then
+       if tty -s; then
+           case $TERM in 
+               xterm*)
+                   echo -ne "\033]2; acceptance-small: $* \007" >&0
+                   ;;
+           esac
+       fi
+    fi 
      log "-----============= acceptance-small: "$*" ============----- `date`"
      RANTEST=${RANTEST}$*", "
  }
  
+skip_remost()
+{
+       remote_ost_nodsh && log "SKIP: $1: remote OST with nodsh" && return 0
+       return 1
+}
+
+skip_remmds()
+{
+       remote_mds_nodsh && log "SKIP: $1: remote MDS with nodsh" && return 0
+       return 1
+}
+
  for NAME in $CONFIGS; do
         export NAME MOUNT START CLEAN
         . $LUSTRE/tests/cfg/$NAME.sh
@@ -73,6 +110,8 @@ for NAME in $CONFIGS; do
  
         setup_if_needed
  
+       MSKIPPED=0
+       OSKIPPED=0
         if [ "$RUNTESTS" != "no" ]; then
                 title runtests
                 bash runtests
@@ -101,19 +140,19 @@ for NAME in $CONFIGS; do
                 $DEBUG_OFF
                 myUID=$RUNAS_ID
                 myRUNAS=$RUNAS
-               FAIL_ON_ERROR=false check_runas_id_ret $myUID $myRUNAS || { myRUNAS="" && myUID=$UID; }
+               FAIL_ON_ERROR=false check_runas_id_ret $myUID $myUID $myRUNAS || { myRUNAS="" && myUID=$UID; }
                 chown $myUID:$myUID $DBENCHDIR
                 duration=""
                 [ "$SLOW" = "no" ] && duration=" -t 120"
                 if [ "$SLOW" != "no" -o $DB_THREADS -eq 1 ]; then
-                       DIR=$DBENCHDIR $myRUNAS bash rundbench 1 $duration
+                       $myRUNAS bash rundbench -D $DBENCHDIR 1 $duration || error "dbench failed!"
                         $DEBUG_ON
                         $CLEANUP
                         $SETUP
                 fi
                 if [ $DB_THREADS -gt 1 ]; then
                         $DEBUG_OFF
-                       DIR=$DBENCHDIR $myRUNAS bash rundbench $DB_THREADS $duration
+                       $myRUNAS bash rundbench -D $DBENCHDIR $DB_THREADS $duration
                         $DEBUG_ON
                         $CLEANUP
                         $SETUP
@@ -129,14 +168,14 @@ for NAME in $CONFIGS; do
                 mkdir -p $BONDIR
                 $LFS setstripe -c -1 $BONDIR
                 sync
-               MIN=`cat /proc/fs/lustre/osc/*/kbytesavail | sort -n | head -n1`
+               MIN=`lctl get_param -n osc.*.kbytesavail | sort -n | head -n1`
                 SPACE=$(( OSTCOUNT * MIN ))
                 [ $SPACE -lt $SIZE ] && SIZE=$((SPACE * 3 / 4))
                 log "min OST has ${MIN}kB available, using ${SIZE}kB file size"
                 $DEBUG_OFF
                 myUID=$RUNAS_ID
                 myRUNAS=$RUNAS
-               FAIL_ON_ERROR=false check_runas_id_ret $myUID $myRUNAS || { myRUNAS="" && myUID=$UID; }
+               FAIL_ON_ERROR=false check_runas_id_ret $myUID $myUID $myRUNAS || { myRUNAS="" && myUID=$UID; }
                 chown $myUID:$myUID $BONDIR             
                 $myRUNAS bonnie++ -f -r 0 -s$((SIZE / 1024)) -n 10 -u$myUID:$myUID -d$BONDIR
                 $DEBUG_ON
@@ -155,19 +194,23 @@ for NAME in $CONFIGS; do
                 mkdir -p $IOZDIR
                 $LFS setstripe -c -1 $IOZDIR
                 sync
-               MIN=`cat /proc/fs/lustre/osc/*/kbytesavail | sort -n | head -n1`
+               MIN=`lctl get_param -n osc.*.kbytesavail | sort -n | head -n1`
                 SPACE=$(( OSTCOUNT * MIN ))
                 [ $SPACE -lt $SIZE ] && SIZE=$((SPACE * 3 / 4))
                 log "min OST has ${MIN}kB available, using ${SIZE}kB file size"
-               IOZONE_OPTS="-i 0 -i 1 -i 2 -e -+d -r $RSIZE -s $SIZE"
+               IOZONE_OPTS="-i 0 -i 1 -i 2 -e -+d -r $RSIZE"
                 IOZFILE="$IOZDIR/iozone"
+               IOZLOG=$TMP/iozone.log
                 # $SPACE was calculated with all OSTs
                 $DEBUG_OFF
                 myUID=$RUNAS_ID
                 myRUNAS=$RUNAS
-               FAIL_ON_ERROR=false check_runas_id_ret $myUID $myRUNAS || { myRUNAS="" && myUID=$UID; }
+               FAIL_ON_ERROR=false check_runas_id_ret $myUID $myUID $myRUNAS || { myRUNAS="" && myUID=$UID; }
                 chown $myUID:$myUID $IOZDIR
-               $myRUNAS iozone $IOZONE_OPTS -f $IOZFILE
+               $myRUNAS iozone $IOZONE_OPTS -s $SIZE -f $IOZFILE 2>&1 | tee $IOZLOG
+               tail -1 $IOZLOG | grep -q complete || \
+                       { error "iozone (1) failed" && false; }
+               rm -f $IOZLOG
                 $DEBUG_ON
                 $CLEANUP
                 $SETUP
@@ -176,14 +219,17 @@ for NAME in $CONFIGS; do
                 if [ -z "$O_DIRECT" ]; then
                         touch $MOUNT/f.iozone
                         if ! ./directio write $MOUNT/f.iozone 0 1; then
+                               log "SKIP iozone DIRECT IO test"
                                 O_DIRECT=no
                         fi
                         rm -f $MOUNT/f.iozone
                 fi
                 if [ "$O_DIRECT" != "no" -a "$IOZONE_DIR" != "no" ]; then
                         $DEBUG_OFF
-                       # cd TMP to have write permission for tmp file iozone writes
-                       ( cd $TMP && $myRUNAS iozone -I $IOZONE_OPTS $IOZFILE.odir )
+                       $myRUNAS iozone -I $IOZONE_OPTS -s $SIZE -f $IOZFILE.odir 2>&1 | tee $IOZLOG
+                       tail -1 $IOZLOG | grep -q complete || \
+                               { error "iozone (2) failed" && false; }
+                       rm -f $IOZLOG
                         $DEBUG_ON
                         $CLEANUP
                         $SETUP
@@ -197,12 +243,15 @@ for NAME in $CONFIGS; do
                         $LFS setstripe -c -1 $IOZDIR
                         $DEBUG_OFF
                         THREAD=1
-                       IOZFILE="-F "
+                       IOZFILE=" "
                         while [ $THREAD -le $IOZ_THREADS ]; do
                                 IOZFILE="$IOZFILE $IOZDIR/iozone.$THREAD"
                                 THREAD=$((THREAD + 1))
                         done
-                       $myRUNAS iozone $IOZONE_OPTS -t $IOZ_THREADS $IOZFILE
+                       $myRUNAS iozone $IOZONE_OPTS -s $((SIZE / IOZ_THREADS)) -t $IOZ_THREADS -F $IOZFILE 2>&1 | tee $IOZLOG
+                       tail -1 $IOZLOG | grep -q complete || \
+                               { error "iozone (3) failed" && false; }
+                       rm -f $IOZLOG
                         $DEBUG_ON
                         $CLEANUP
                         $SETUP
@@ -215,10 +264,15 @@ for NAME in $CONFIGS; do
  
         if [ "$FSX" != "no" ]; then
                 title fsx
+               FSX_SIZE=$((RAMKB / 2))
                 SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'`
-               [ $SPACE -lt $SIZE ] && SIZE=$((SPACE * 3 / 4))
+               [ $SPACE -lt $FSX_SIZE ] && FSX_SIZE=$((SPACE * 3 / 4))
                 $DEBUG_OFF
-               ./fsx -c 50 -p 1000 -P $TMP -l $SIZE \
+               FSX_SEED=${FSX_SEED:-$RANDOM}
+               rm -f $MOUNT/fsxfile
+               $LFS setstripe -c -1 $MOUNT/fsxfile
+               echo Using FSX_SEED=$FSX_SEED FSX_SIZE=$FSX_SIZE COUNT=$COUNT
+               ./fsx -c 50 -p 1000 -S $FSX_SEED -P $TMP -l $FSX_SIZE \
                         -N $(($COUNT * 100)) $MOUNT/fsxfile
                 $DEBUG_ON
                 $CLEANUP
@@ -234,7 +288,7 @@ for NAME in $CONFIGS; do
                 mount_client $MOUNT2
                 #echo "can't mount2 for '$NAME', skipping sanityN.sh"
                 START=: CLEAN=: bash sanityN.sh
-               umount $MOUNT2
+               [ "$(mount | grep $MOUNT2)" ] && umount $MOUNT2
  
                 $DEBUG_ON
                 $CLEANUP
@@ -242,43 +296,32 @@ for NAME in $CONFIGS; do
                 SANITYN="done"
         fi
  
-       remote_mds && log "Remote MDS, skipping LFSCK test" && LFSCK=no
-       remote_ost && log "Remote OST, skipping LFSCK test" && LFSCK=no
-
-       if [ "$LFSCK" != "no" -a -x /usr/sbin/lfsck ]; then
+       [ "$LFSCK" != "no" ] && remote_mds && log "Remote MDS, skipping LFSCK test" && LFSCK=no && MSKIPPED=1
+       [ "$LFSCK" != "no" ] && remote_ost && log "Remote OST, skipping LFSCK test" && LFSCK=no && OSKIPPED=1
+       if [ "$LFSCK" != "no" ]; then
                 title lfsck
-               E2VER=`e2fsck -V 2>&1 | head -n 1 | cut -d' ' -f 2`
-               if [ `echo $E2VER | cut -d. -f2` -ge 39 ] && \
-                  [ "`echo $E2VER | grep cfs`" -o \
-                       "`echo $E2VER | grep sun`" ]; then
-                               bash lfscktest.sh
+               if [ -x /usr/sbin/lfsck ]; then
+                       bash lfscktest.sh
                 else
-                       e2fsck -V
-                       echo "e2fsck does not support lfsck, skipping"
+                       log "$(e2fsck -V)"
+                       log "SKIP: e2fsck does not support lfsck"
                 fi
                 LFSCK="done"
         fi
  
         [ "$NETTYPE" = "tcp" -o "$NETTYPE" = "ptl" ] || LIBLUSTRE=no # bug 15660
+       if [ "$LIBLUSTRE" != "no" ] && ! check_versions ; then
+               skip liblustre version mismatch: cli $CLIVER, mds $MDSVER, ost $OSTVER
+               LIBLUSTRE=no    # bug 17696
+       fi
         if [ "$LIBLUSTRE" != "no" ]; then
-               title liblustre
+               title liblustre
                 assert_env MGSNID MOUNT2
-               $CLEANUP
-               unload_modules
-               # Liblustre needs accept=all, noacl
-               [ -f /etc/modprobe.conf ] && MODPROBECONF=/etc/modprobe.conf
-               [ -f /etc/modprobe.d/Lustre ] && MODPROBECONF=/etc/modprobe.d/Lustre
-
-               LNETOPTS="$(awk '/^options lnet/ { print $0}' $MODPROBECONF | \
-                       sed 's/^options lnet //g; s/"//g') accept=all" \
-                       MDS_MOUNT_OPTS=$(echo $MDS_MOUNT_OPTS | sed 's/^[ \t]*//;s/[ \t]*$//') \
-                       MDS_MOUNT_OPTS="${MDS_MOUNT_OPTS},noacl" \
-                       MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS/#,/-o } \
-                       $SETUP
                 export LIBLUSTRE_MOUNT_POINT=$MOUNT2
+               export LIBLUSTRE_MOUNT_RETRY=5
                 export LIBLUSTRE_MOUNT_TARGET=$MGSNID:/$FSNAME
-               export LIBLUSTRE_TIMEOUT=`cat /proc/sys/lustre/timeout`
-               #export LIBLUSTRE_DEBUG_MASK=`cat /proc/sys/lnet/debug`
+               export LIBLUSTRE_TIMEOUT=`lctl get_param -n timeout`
+               #export LIBLUSTRE_DEBUG_MASK=`lctl get_param -n debug`
                 if [ -x $LIBLUSTRETESTS/sanity ]; then
                         mkdir -p $MOUNT2
                         echo $LIBLUSTRETESTS/sanity --target=$LIBLUSTRE_MOUNT_TARGET
@@ -289,45 +332,75 @@ for NAME in $CONFIGS; do
                 LIBLUSTRE="done"
         fi
  
-       $CLEANUP
+       [ "$RACER" != "no" ] && [ -n "$CLIENTS" -a "$PDSH" = "no_dsh" ] && log "Remote client with no_dsh" && RACER=no 
+       if [ "$RACER" != "no" ]; then
+               title racer
+               setup_if_needed
+               DURATION=${DURATION:-900}
+               [ "$SLOW" = "no" ] && DURATION=300
+               RACERCLIENTS=$HOSTNAME
+               [ ! -z ${CLIENTS} ] && RACERCLIENTS=$CLIENTS
+               log "racer on clients: $RACERCLIENTS DURATION=$DURATION"
+               CLIENTS=${RACERCLIENTS} DURATION=$DURATION bash runracer
+               $CLEANUP
+               $SETUP
+               RACER="done"
+       fi
  done
  
+[ "$REPLAY_SINGLE" != "no" ] && skip_remmds replay-single && REPLAY_SINGLE=no && MSKIPPED=1
  if [ "$REPLAY_SINGLE" != "no" ]; then
          title replay-single
         bash replay-single.sh
         REPLAY_SINGLE="done"
  fi
  
+[ "$CONF_SANITY" != "no" ] && skip_remmds conf-sanity && CONF_SANITY=no && MSKIPPED=1
+[ "$CONF_SANITY" != "no" ] && skip_remost conf-sanity && CONF_SANITY=no && OSKIPPED=1
  if [ "$CONF_SANITY" != "no" ]; then
          title conf-sanity
          bash conf-sanity.sh
          CONF_SANITY="done"
  fi
  
+[ "$RECOVERY_SMALL" != "no" ] && skip_remmds recover-small && RECOVERY_SMALL=no && MSKIPPED=1
  if [ "$RECOVERY_SMALL" != "no" ]; then
          title recovery-small
          bash recovery-small.sh
          RECOVERY_SMALL="done"
  fi
  
+[ "$REPLAY_OST_SINGLE" != "no" ] && skip_remost replay-ost-single && REPLAY_OST_SINGLE=no && OSKIPPED=1
  if [ "$REPLAY_OST_SINGLE" != "no" ]; then
          title replay-ost-single
          bash replay-ost-single.sh
          REPLAY_OST_SINGLE="done"
  fi
  
+[ "$REPLAY_DUAL" != "no" ] && skip_remost replay-dual && REPLAY_DUAL=no && OSKIPPED=1
  if [ "$REPLAY_DUAL" != "no" ]; then
          title replay-dual
          bash replay-dual.sh
          REPLAY_DUAL="done"
  fi
  
+[ "$REPLAY_VBR" != "no" ] && skip_remmds replay-vbr && REPLAY_VBR=no && MSKIPPED=1
+if [ "$REPLAY_VBR" != "no" ]; then
+        title replay-vbr
+        bash replay-vbr.sh
+        REPLAY_VBR="done"
+fi
+
+[ "$INSANITY" != "no" ] && skip_remmds insanity && INSANITY=no && MSKIPPED=1
+[ "$INSANITY" != "no" ] && skip_remost insanity && INSANITY=no && OSKIPPED=1
  if [ "$INSANITY" != "no" ]; then
          title insanity
          bash insanity.sh -r
          INSANITY="done"
  fi
  
+[ "$SANITY_QUOTA" != "no" ] && skip_remmds sanity-quota && SANITY_QUOTA=no && MSKIPPED=1
+[ "$SANITY_QUOTA" != "no" ] && skip_remost sanity-quota && SANITY_QUOTA=no && OSKIPPED=1
  if [ "$SANITY_QUOTA" != "no" ]; then
          title sanity-quota
          bash sanity-quota.sh
@@ -335,9 +408,36 @@ if [ "$SANITY_QUOTA" != "no" ]; then
  fi
  
  
+[ "$SLOW" = no ] && PERFORMANCE_SANITY="no"
+[ -x "$MDSRATE" ] || PERFORMANCE_SANITY="no"
+which mpirun > /dev/null 2>&1 || PERFORMANCE_SANITY="no"
+if [ "$PERFORMANCE_SANITY" != "no" ]; then
+        title performance-sanity
+        bash performance-sanity.sh
+        PERFORMANCE_SANITY="done"
+fi
+
+[ "$LARGE_SCALE" != "no" ] && skip_remmds large-scale && LARGE_SCALE=no && MSKIPPED=1
+if [ "$LARGE_SCALE" != "no" ]; then
+        title large-scale
+        bash large-scale.sh
+        LARGE_SCALE="done"
+fi
+
+[ "$SLOW" = no ] && RECOVERY_MDS_SCALE="no"
+[ "$RECOVERY_MDS_SCALE" != "no" ] && skip_remmds recovery-mds-scale && RECOVERY_MDS_SCALE=no && MSKIPPED=1
+[ "$RECOVERY_MDS_SCALE" != "no" ] && skip_remost recovery-mds-scale && RECOVERY_MDS_SCALE=no && OSKIPPED=1
+if [ "$RECOVERY_MDS_SCALE" != "no" ]; then
+        title recovery-mds-scale
+        bash recovery-mds-scale.sh
+        RECOVERY_MDS_SCALE="done"
+fi
+
  RC=$?
  title FINISHED
  echo "Finished at `date` in $((`date +%s` - $STARTTIME))s"
  echo "Tests ran: $RANTEST"
  print_summary
+[ "$MSKIPPED" = 1 ] && log "FAIL: remote MDS tests skipped" && RC=1
+[ "$OSKIPPED" = 1 ] && log "FAIL: remote OST tests skipped" && RC=1
  echo "$0: completed with rc $RC" && exit $RC
diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh

index 00d5796..60096be 100644 (file)
--- a/lustre/tests/cfg/local.sh
+++ b/lustre/tests/cfg/local.sh
@@ -6,6 +6,7 @@ mdsfailover_HOST=${mdsfailover_HOST}
  mgs_HOST=${mgs_HOST:-$mds_HOST}
  ost_HOST=${ost_HOST:-`hostname`}
  ostfailover_HOST=${ostfailover_HOST}
+CLIENTS=""
  PDSH=${PDSH:-no_dsh}
  
  TMP=${TMP:-/tmp}
@@ -36,7 +37,6 @@ SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
  L_GETGROUPS=${L_GETGROUPS:-`do_facet mds which l_getgroups || echo`}
  
  MKFSOPT=""
-MOUNTOPT=""
  [ "x$MDSJOURNALSIZE" != "x" ] &&
      MKFSOPT=$MKFSOPT" -J size=$MDSJOURNALSIZE"
  [ "x$MDSISIZE" != "x" ] &&
@@ -44,24 +44,23 @@ MOUNTOPT=""
  [ "x$MKFSOPT" != "x" ] &&
      MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\""
  [ "x$mdsfailover_HOST" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`"
+    MDSOPT=$MDSOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`"
  [ "x$STRIPE_BYTES" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --param lov.stripesize=$STRIPE_BYTES"
+    MDSOPT=$MDSOPT" --param lov.stripesize=$STRIPE_BYTES"
  [ "x$STRIPES_PER_OBJ" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --param lov.stripecount=$STRIPES_PER_OBJ"
+    MDSOPT=$MDSOPT" --param lov.stripecount=$STRIPES_PER_OBJ"
  [ "x$L_GETGROUPS" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --param mdt.group_upcall=$L_GETGROUPS"
-MDS_MKFS_OPTS="--mgs --mdt --fsname=$FSNAME --device-size=$MDSSIZE --param sys.timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $MDSOPT"
+    MDSOPT=$MDSOPT" --param mdt.group_upcall=$L_GETGROUPS"
+MDS_MKFS_OPTS="--mgs --mdt --fsname=$FSNAME --device-size=$MDSSIZE --param sys.timeout=$TIMEOUT $MKFSOPT $MDSOPT $MDS_MKFS_OPTS"
  
  MKFSOPT=""
-MOUNTOPT=""
  [ "x$OSTJOURNALSIZE" != "x" ] &&
      MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE"
  [ "x$MKFSOPT" != "x" ] &&
      MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\""
  [ "x$ostfailover_HOST" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`"
-OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $OSTOPT"
+    OSTOPT=$OSTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`"
+OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$TIMEOUT $MKFSOPT $OSTOPT $OST_MKFS_OPTS"
  
  MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS:-"-o loop"}
  OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-"-o loop"}
diff --git a/lustre/tests/cfg/ncli.sh b/lustre/tests/cfg/ncli.sh

index 6dfae9b..a3fff9f 100644 (file)
--- a/lustre/tests/cfg/ncli.sh
+++ b/lustre/tests/cfg/ncli.sh
@@ -3,16 +3,22 @@
  CLIENT1=${CLIENT1:-`hostname`}
  SINGLECLIENT=$CLIENT1
  RCLIENTS=${RCLIENTS:-""}
-CLIENTS=`comma_list $SINGLECLIENT $RCLIENTS`
-REMOTECLIENTS=($RCLIENTS)
-for ((i=0; $i<${#REMOTECLIENTS[@]}; i++)); do
-       varname=CLIENT$((i + 2))
-       eval $varname=${REMOTECLIENTS[i]}
-done
  
-CLIENTCOUNT=$((${#REMOTECLIENTS[@]} + 1))
+init_clients_lists
  
  [ -n "$RCLIENTS" -a "$PDSH" = "no_dsh" ] && \
                  error "tests for remote clients $RCLIENTS needs pdsh != do_dsh " || true
  
  [ -n "$FUNCTIONS" ] && . $FUNCTIONS || true
+
+MPIBIN=${MPIBIN:-/testsuite/tests/`arch`/bin}
+export PATH=:$PATH:$MPIBIN
+MPIRUN=$(which mpirun) || true
+MPI_USER=${MPI_USER:-mpiuser}
+
+# for recovery scale tests
+# default boulder cluster iozone location
+export PATH=/opt/iozone/bin:$PATH
+SHARED_DIRECTORY=${SHARED_DIRECTORY:-""}       # bug 17839 comment 65
+LOADS="dd tar dbench iozone"
+CLIENT_LOADS=($LOADS)
diff --git a/lustre/tests/checkstat.c b/lustre/tests/checkstat.c

index c98d6aa..b53a82a 100644 (file)
--- a/lustre/tests/checkstat.c
+++ b/lustre/tests/checkstat.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/types.h>
diff --git a/lustre/tests/chownmany.c b/lustre/tests/chownmany.c

index 64512ef..ebdf9b6 100644 (file)
--- a/lustre/tests/chownmany.c
+++ b/lustre/tests/chownmany.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/cmknod.c b/lustre/tests/cmknod.c

index 920ee5b..49e7ecc 100644 (file)
--- a/lustre/tests/cmknod.c
+++ b/lustre/tests/cmknod.c
@@ -1,4 +1,43 @@
-/* Simple test to check that device nodes are correctly created and visible */
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/cmknod.c
+ *
+ * Simple test to check that device nodes are correctly created and visible
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh

index fda47a4..8347754 100644 (file)
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -11,12 +11,8 @@ set -e
  
  ONLY=${ONLY:-"$*"}
  
-# These tests don't apply to mountconf
-#              xml xml xml xml xml xml dumb
-MOUNTCONFSKIP="10  11  12  13  13b 14  15 "
-
-# bug number for skipped test:                     13369
-ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $MOUNTCONFSKIP 34a"
+# bug number for skipped test:      13369
+ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT 34a"
  # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
  
  SRCDIR=`dirname $0`
@@ -29,13 +25,20 @@ HOSTNAME=`hostname`
  
  . $LUSTRE/tests/test-framework.sh
  init_test_env $@
+# STORED_MDSSIZE is used in test_18
+if [ -n "$MDSSIZE" ]; then
+    STORED_MDSSIZE=$MDSSIZE
+fi
  # use small MDS + OST size to speed formatting time
  MDSSIZE=40000
  OSTSIZE=40000
  . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
  #
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="0 1 2 3 6 7 15 18 24b 25 30 31 32 33 34a "
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="0 1 2 3 6 7 15 18 24b 25 30 31 32 33 34a 45"
  
  assert_DIR
  
@@ -390,261 +393,8 @@ test_9() {
  
  run_test 9 "test ptldebug and subsystem for mkfs"
  
-test_10() {
-        echo "generate configuration with the same name for node and mds"
-        OLDXMLCONFIG=$XMLCONFIG
-        XMLCONFIG="broken.xml"
-        [ -f "$XMLCONFIG" ] && rm -f $XMLCONFIG
-        facet="mds"
-        rm -f ${facet}active
-        add_facet $facet
-        echo "the name for node and mds is the same"
-        do_lmc --add mds --node ${facet}_facet --mds ${facet}_facet \
-            --dev $MDSDEV --size $MDSSIZE || return $?
-        do_lmc --add lov --mds ${facet}_facet --lov lov1 --stripe_sz \
-            $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ \
-            --stripe_pattern 0 || return $?
-        add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE
-        facet="client"
-        add_facet $facet --lustre_upcall $UPCALL
-        do_lmc --add mtpt --node ${facet}_facet --mds mds_facet \
-            --lov lov1 --path $MOUNT
-
-        echo "mount lustre"
-        start_ost
-        start_mds
-        mount_client $MOUNT
-        check_mount || return 41
-        cleanup || return $?
-
-        echo "Success!"
-        XMLCONFIG=$OLDXMLCONFIG
-}
-run_test 10 "mount lustre with the same name for node and mds"
-
-test_11() {
-        OLDXMLCONFIG=$XMLCONFIG
-        XMLCONFIG="conf11.xml"
-
-        [ -f "$XMLCONFIG" ] && rm -f $XMLCONFIG
-        add_mds mds --dev $MDSDEV --size $MDSSIZE
-        add_ost ost --dev $OSTDEV --size $OSTSIZE
-        add_client client mds --path $MOUNT --ost ost_svc || return $?
-        echo "Default lov config success!"
-
-        [ -f "$XMLCONFIG" ] && rm -f $XMLCONFIG
-        add_mds mds --dev $MDSDEV --size $MDSSIZE
-        add_ost ost --dev $OSTDEV --size $OSTSIZE
-        add_client client mds --path $MOUNT && return $?
-        echo "--add mtpt with neither --lov nor --ost will return error"
-
-        echo ""
-        echo "Success!"
-        XMLCONFIG=$OLDXMLCONFIG
-}
-run_test 11 "use default lov configuration (should return error)"
-
-test_12() {
-        OLDXMLCONFIG=$XMLCONFIG
-        XMLCONFIG="batch.xml"
-        BATCHFILE="batchfile"
-
-        # test double quote
-        [ -f "$XMLCONFIG" ] && rm -f $XMLCONFIG
-        [ -f "$BATCHFILE" ] && rm -f $BATCHFILE
-        echo "--add net --node $HOSTNAME --nid $HOSTNAME --nettype tcp" > $BATCHFILE
-        echo "--add mds --node $HOSTNAME --mds mds1 --mkfsoptions \"-I 128\"" >> $BATCHFILE
-        # --mkfsoptions "-I 128"
-        do_lmc -m $XMLCONFIG --batch $BATCHFILE || return $?
-        if [ `sed -n '/>-I 128</p' $XMLCONFIG | wc -l` -eq 1 ]; then
-                echo "matched double quote success"
-        else
-                echo "matched double quote fail"
-                return 1
-        fi
-        rm -f $XMLCONFIG
-        rm -f $BATCHFILE
-        echo "--add net --node $HOSTNAME --nid $HOSTNAME --nettype tcp" > $BATCHFILE
-        echo "--add mds --node $HOSTNAME --mds mds1 --mkfsoptions \"-I 128" >> $BATCHFILE
-        # --mkfsoptions "-I 128
-        do_lmc -m $XMLCONFIG --batch $BATCHFILE && return $?
-        echo "unmatched double quote should return error"
-
-        # test single quote
-        rm -f $BATCHFILE
-        echo "--add net --node $HOSTNAME --nid $HOSTNAME --nettype tcp" > $BATCHFILE
-        echo "--add mds --node $HOSTNAME --mds mds1 --mkfsoptions '-I 128'" >> $BATCHFILE
-        # --mkfsoptions '-I 128'
-        do_lmc -m $XMLCONFIG --batch $BATCHFILE || return $?
-        if [ `sed -n '/>-I 128</p' $XMLCONFIG | wc -l` -eq 1 ]; then
-                echo "matched single quote success"
-        else
-                echo "matched single quote fail"
-                return 1
-        fi
-        rm -f $XMLCONFIG
-        rm -f $BATCHFILE
-        echo "--add net --node $HOSTNAME --nid $HOSTNAME --nettype tcp" > $BATCHFILE
-        echo "--add mds --node $HOSTNAME --mds mds1 --mkfsoptions '-I 128" >> $BATCHFILE
-        # --mkfsoptions '-I 128
-        do_lmc -m $XMLCONFIG --batch $BATCHFILE && return $?
-        echo "unmatched single quote should return error"
-
-        # test backslash
-        rm -f $BATCHFILE
-        echo "--add net --node $HOSTNAME --nid $HOSTNAME --nettype tcp" > $BATCHFILE
-        echo "--add mds --node $HOSTNAME --mds mds1 --mkfsoptions \-\I\ \128" >> $BATCHFILE
-        # --mkfsoptions \-\I\ \128
-        do_lmc -m $XMLCONFIG --batch $BATCHFILE || return $?
-        if [ `sed -n '/>-I 128</p' $XMLCONFIG | wc -l` -eq 1 ]; then
-                echo "backslash followed by a whitespace/letter success"
-        else
-                echo "backslash followed by a whitespace/letter fail"
-                return 1
-        fi
-        rm -f $XMLCONFIG
-        rm -f $BATCHFILE
-        echo "--add net --node $HOSTNAME --nid $HOSTNAME --nettype tcp" > $BATCHFILE
-        echo "--add mds --node $HOSTNAME --mds mds1 --mkfsoptions -I\ 128\\" >> $BATCHFILE
-        # --mkfsoptions -I\ 128\
-        do_lmc -m $XMLCONFIG --batch $BATCHFILE && return $?
-        echo "backslash followed by nothing should return error"
-
-        rm -f $BATCHFILE
-        XMLCONFIG=$OLDXMLCONFIG
-}
-run_test 12 "lmc --batch, with single/double quote, backslash in batchfile"
-
-test_13a() {   # was test_13
-        OLDXMLCONFIG=$XMLCONFIG
-        XMLCONFIG="conf13-1.xml"
-
-        # check long uuid will be truncated properly and uniquely
-        echo "To generate XML configuration file(with long ost name): $XMLCONFIG"
-        [ -f "$XMLCONFIG" ] && rm -f $XMLCONFIG
-        do_lmc --add net --node $HOSTNAME --nid $HOSTNAME --nettype tcp
-        do_lmc --add mds --node $HOSTNAME --mds mds1_name_longer_than_31characters
-        do_lmc --add mds --node $HOSTNAME --mds mds2_name_longer_than_31characters
-        if [ ! -f "$XMLCONFIG" ]; then
-                echo "Error:no file $XMLCONFIG created!"
-                return 1
-        fi
-        EXPECTEDMDS1UUID="e_longer_than_31characters_UUID"
-        EXPECTEDMDS2UUID="longer_than_31characters_UUID_2"
-        FOUNDMDS1UUID=`awk -F"'" '/<mds .*uuid=/' $XMLCONFIG | sed -n '1p' \
-                       | sed "s/ /\n\r/g" | awk -F"'" '/uuid=/{print $2}'`
-        FOUNDMDS2UUID=`awk -F"'" '/<mds .*uuid=/' $XMLCONFIG | sed -n '2p' \
-                       | sed "s/ /\n\r/g" | awk -F"'" '/uuid=/{print $2}'`
-       [ -z "$FOUNDMDS1UUID" ] && echo "MDS1 UUID empty" && return 1
-       [ -z "$FOUNDMDS2UUID" ] && echo "MDS2 UUID empty" && return 1
-        if ([ $EXPECTEDMDS1UUID = $FOUNDMDS1UUID ] && [ $EXPECTEDMDS2UUID = $FOUNDMDS2UUID ]) || \
-           ([ $EXPECTEDMDS1UUID = $FOUNDMDS2UUID ] && [ $EXPECTEDMDS2UUID = $FOUNDMDS1UUID ]); then
-                echo "Success:long uuid truncated successfully and being unique."
-        else
-                echo "Error:expected uuid for mds1 and mds2: $EXPECTEDMDS1UUID; $EXPECTEDMDS2UUID"
-                echo "but:     found uuid for mds1 and mds2: $FOUNDMDS1UUID; $FOUNDMDS2UUID"
-                return 1
-        fi
-        rm -f $XMLCONFIG
-        XMLCONFIG=$OLDXMLCONFIG
-}
-run_test 13a "check new_uuid of lmc operating correctly"
-
-test_13b() {
-        OLDXMLCONFIG=$XMLCONFIG
-        XMLCONFIG="conf13-1.xml"
-        SECONDXMLCONFIG="conf13-2.xml"
-        # check multiple invocations for lmc generate same XML configuration file
-        rm -f $XMLCONFIG
-        echo "Generate the first XML configuration file"
-        gen_config
-        echo "mv $XMLCONFIG to $SECONDXMLCONFIG"
-        sed -e "s/mtime[^ ]*//" $XMLCONFIG > $SECONDXMLCONFIG || return $?
-        echo "Generate the second XML configuration file"
-        gen_config
-       # don't compare .xml mtime, it will always be different
-        if [ `sed -e "s/mtime[^ ]*//" $XMLCONFIG | diff - $SECONDXMLCONFIG | wc -l` -eq 0 ]; then
-                echo "Success:multiple invocations for lmc generate same XML file"
-        else
-                echo "Error: multiple invocations for lmc generate different XML file"
-                return 1
-        fi
-
-        rm -f $XMLCONFIG $SECONDXMLCONFIG
-        XMLCONFIG=$OLDXMLCONFIG
-}
-run_test 13b "check lmc generates consistent .xml file"
-
-test_14() {
-        rm -f $XMLCONFIG
-
-        # create xml file with --mkfsoptions for ost
-        echo "create xml file with --mkfsoptions for ost"
-        add_mds mds --dev $MDSDEV --size $MDSSIZE
-        add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
-            --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-        add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE \
-            --mkfsoptions "-Llabel_conf_14"
-        add_client client mds --lov lov1 --path $MOUNT
-
-        FOUNDSTRING=`awk -F"<" '/<mkfsoptions>/{print $2}' $XMLCONFIG`
-        EXPECTEDSTRING="mkfsoptions>-Llabel_conf_14"
-        if [ "$EXPECTEDSTRING" != "$FOUNDSTRING" ]; then
-                echo "Error: expected: $EXPECTEDSTRING; found: $FOUNDSTRING"
-                return 1
-        fi
-        echo "Success:mkfsoptions for ost written to xml file correctly."
-
-        # mount lustre to test lconf mkfsoptions-parsing
-        echo "mount lustre"
-        start_ost
-        start_mds
-        mount_client $MOUNT || return $?
-        if [ -z "`do_facet ost1 dumpe2fs -h $OSTDEV | grep label_conf_14`" ]; then
-                echo "Error: the mkoptions not applied to mke2fs of ost."
-                return 1
-        fi
-        cleanup
-        echo "lconf mkfsoptions for ost success"
-
-        gen_config
-}
-run_test 14 "test mkfsoptions of ost for lmc and lconf"
-
-cleanup_15() {
-       trap 0
-       [ -f $MOUNTLUSTRE ] && echo "remove $MOUNTLUSTRE" && rm -f $MOUNTLUSTRE
-       if [ -f $MOUNTLUSTRE.sav ]; then
-               echo "return original $MOUNTLUSTRE.sav to $MOUNTLUSTRE"
-               mv $MOUNTLUSTRE.sav $MOUNTLUSTRE
-       fi
-}
-
-# this only tests the kernel mount command, not anything about lustre.
-test_15() {
-        MOUNTLUSTRE=${MOUNTLUSTRE:-/sbin/mount.lustre}
-       start_ost
-       start_mds
-
-       echo "mount lustre on ${MOUNT} without $MOUNTLUSTRE....."
-       if [ -f "$MOUNTLUSTRE" ]; then
-               echo "save $MOUNTLUSTRE to $MOUNTLUSTRE.sav"
-               mv $MOUNTLUSTRE $MOUNTLUSTRE.sav && trap cleanup_15 EXIT INT
-               if [ -f $MOUNTLUSTRE ]; then
-                       skip "$MOUNTLUSTRE cannot be moved, skipping test"
-                       return 0
-               fi
-       fi
-
-       mount_client $MOUNT && error "mount succeeded" && return 1
-       echo "mount lustre on $MOUNT without $MOUNTLUSTRE failed as expected"
-       cleanup_15
-       cleanup || return $?
-}
-run_test 15 "zconf-mount without /sbin/mount.lustre (should return error)"
-
  test_16() {
-        TMPMTPT="${TMP}/conf16"
+        local TMPMTPT="${TMP}/conf16"
  
          if [ ! -e "$MDSDEV" ]; then
              log "no $MDSDEV existing, so mount Lustre to create one"
@@ -709,10 +459,38 @@ test_17() {
  run_test 17 "Verify failed mds_postsetup won't fail assertion (2936) (should return errs)"
  
  test_18() {
-        [ -f $MDSDEV ] && echo "remove $MDSDEV" && rm -f $MDSDEV
+        [ "$FSTYPE" != "ldiskfs" ] && skip "not needed for FSTYPE=$FSTYPE" && return
+
+        local MIN=2000000
+
+        local OK=
+        # check if current MDSSIZE is large enough
+        [ $MDSSIZE -ge $MIN ] && OK=1 && myMDSSIZE=$MDSSIZE && \
+                log "use MDSSIZE=$MDSSIZE"
+
+        # check if the global config has a large enough MDSSIZE
+        [ -z "$OK" -a ! -z "$STORED_MDSSIZE" ] && [ $STORED_MDSSIZE -ge $MIN ] && \
+                OK=1 && myMDSSIZE=$STORED_MDSSIZE && \
+                log "use STORED_MDSSIZE=$STORED_MDSSIZE"
+
+        # check if the block device is large enough
+        [ -z "$OK" -a -b $MDSDEV ] && \
+                [ "$(dd if=$MDSDEV of=/dev/null bs=1k count=1 skip=$MIN 2>&1 |
+                     awk '($3 == "in") { print $1 }')" = "1+0" ] && OK=1 && \
+                myMDSSIZE=$MIN && log "use device $MDSDEV with MIN=$MIN"
+
+        # check if a loopback device has enough space for fs metadata (5%)
+        [ -z "$OK" ] && [ -f $MDSDEV -o ! -e $MDSDEV ] &&
+                SPACE=$(df -P $(dirname $MDSDEV) |
+                        awk '($1 != "Filesystem") {print $4}') &&
+                [ $SPACE -gt $((MIN / 20)) ] && OK=1 && myMDSSIZE=$MIN && \
+                        log "use file $MDSDEV with MIN=$MIN"
+
+        [ -z "$OK" ] && skip "$MDSDEV too small for ${MIN}kB MDS" && return
+
+
          echo "mount mds with large journal..."
-        local myMDSSIZE=2000000
-        OLD_MDS_MKFS_OPTS=$MDS_MKFS_OPTS
+        local OLD_MDS_MKFS_OPTS=$MDS_MKFS_OPTS
  
          MDS_MKFS_OPTS="--mgs --mdt --fsname=$FSNAME --device-size=$myMDSSIZE --param sys.timeout=$TIMEOUT $MDSOPT"
  
@@ -722,7 +500,7 @@ test_18() {
          check_mount || return 41
  
          echo "check journal size..."
-        FOUNDSIZE=`do_facet mds "debugfs -c -R 'stat <8>' $MDSDEV" | awk '/Size: / { print $NF; exit;}'`
+        local FOUNDSIZE=`do_facet mds "debugfs -c -R 'stat <8>' $MDSDEV" | awk '/Size: / { print $NF; exit;}'`
          if [ $FOUNDSIZE -gt $((32 * 1024 * 1024)) ]; then
                  log "Success: mkfs creates large journals. Size: $((FOUNDSIZE >> 20))M"
          else
@@ -792,18 +570,13 @@ test_21c() {
         stop_ost
         stop_ost2
         stop_mds
+       #writeconf to remove all ost2 traces for subsequent tests
+       writeconf
  }
  run_test 21c "start mds between two osts, stop mds last"
  
  test_22() {
-        #reformat to remove all logs
-        reformat
         start_mds
-       echo Client mount before any osts are in the logs
-       mount_client $MOUNT
-       check_mount && return 41
-       umount_client $MOUNT
-       pass
  
         echo Client mount with ost in logs, but none running
         start_ost
@@ -982,6 +755,29 @@ test_26() {
  }
  run_test 26 "MDT startup failure cleans LOV (should return errs)"
  
+wait_update () {
+       local node=$1
+       local TEST=$2
+       local FINAL=$3
+
+       local RESULT
+       local MAX=90
+       local WAIT=0
+       local sleep=5
+       while [ $WAIT -lt $MAX ]; do
+           RESULT=$(do_node $node "$TEST") 
+           if [ $RESULT -eq $FINAL ]; then
+               echo "Updated config after $WAIT sec: wanted $FINAL got $RESULT"
+               return 0
+           fi
+           WAIT=$((WAIT + sleep))
+           echo "Waiting $((MAX - WAIT)) secs for config update" 
+           sleep $sleep
+       done
+       echo "Config update not seen after $MAX sec: wanted $FINAL got $RESULT"
+       return 3
+}
+
  set_and_check() {
         local myfacet=$1
         local TEST=$2
@@ -995,23 +791,8 @@ set_and_check() {
         fi
         echo "Setting $PARAM from $ORIG to $FINAL"
         do_facet mds "$LCTL conf_param $PARAM=$FINAL" || error conf_param failed
-       local RESULT
-       local MAX=90
-       local WAIT=0
-       while [ 1 ]; do
-           sleep 5
-           RESULT=$(do_facet $myfacet "$TEST") 
-           if [ $RESULT -eq $FINAL ]; then
-               echo "Updated config after $WAIT sec (got $RESULT)"
-               break
-           fi
-           WAIT=$((WAIT + 5))
-           if [ $WAIT -eq $MAX ]; then
-               echo "Config update not seen: wanted $FINAL got $RESULT"
-               return 3
-           fi
-           echo "Waiting $(($MAX - $WAIT)) secs for config update" 
-       done
+
+       wait_update $(facet_host $myfacet) "$TEST" $FINAL || error check failed!
  }
  
  test_27a() {
@@ -1029,6 +810,7 @@ test_27b() {
         facet_failover mds
         set_and_check mds "lctl get_param -n mds.$FSNAME-MDT0000.group_acquire_expire" "$FSNAME-MDT0000.mdt.group_acquire_expire" || return 3
         set_and_check client "lctl get_param -n mdc.$FSNAME-MDT0000-mdc-*.max_rpcs_in_flight" "$FSNAME-MDT0000.mdc.max_rpcs_in_flight" || return 4
+       check_mount
         cleanup
  }
  run_test 27b "Reacquire MGS lock after failover"
@@ -1036,11 +818,12 @@ run_test 27b "Reacquire MGS lock after failover"
  test_28() {
          setup
         TEST="lctl get_param -n llite.$FSNAME-*.max_read_ahead_whole_mb"
-       ORIG=$($TEST) 
-       declare -i FINAL
-       FINAL=$(($ORIG + 10))
-       set_and_check client "$TEST" "$FSNAME.llite.max_read_ahead_whole_mb" || return 3
-       set_and_check client "$TEST" "$FSNAME.llite.max_read_ahead_whole_mb" || return 3
+       PARAM="$FSNAME.llite.max_read_ahead_whole_mb"
+       ORIG=$($TEST)
+       FINAL=$(($ORIG + 1))
+       set_and_check client "$TEST" "$PARAM" $FINAL || return 3
+       FINAL=$(($FINAL + 1))
+       set_and_check client "$TEST" "$PARAM" $FINAL || return 4
         umount_client $MOUNT || return 200
         mount_client $MOUNT
         RESULT=$($TEST)
@@ -1050,6 +833,7 @@ test_28() {
         else
             echo "New config success: got $RESULT"
         fi
+       set_and_check client "$TEST" "$PARAM" $ORIG || return 5
         cleanup
  }
  run_test 28 "permanent parameter setting"
@@ -1125,8 +909,9 @@ test_30() {
         setup
  
         TEST="lctl get_param -n llite.$FSNAME-*.max_read_ahead_whole_mb"
-       ORIG=$($TEST) 
-       for i in $(seq 1 20); do 
+       ORIG=$($TEST)
+       LIST=(1 2 3 4 5 4 3 2 1 2 3 4 5 4 3 2 1 2 3 4 5)
+       for i in ${LIST[@]}; do
             set_and_check client "$TEST" "$FSNAME.llite.max_read_ahead_whole_mb" $i || return 3
         done
         # make sure client restart still works 
@@ -1145,18 +930,55 @@ test_31() { # bug 10734
  }
  run_test 31 "Connect to non-existent node (returns errors, should not crash)"
  
+# Use these start32/stop32 fn instead of t-f start/stop fn,
+# for local devices, to skip global facet vars init 
+stop32 () {
+       local facet=$1
+       shift
+       echo "Stopping local ${MOUNT%/*}/${facet} (opts:$@)"
+       umount -d $@ ${MOUNT%/*}/${facet}
+       losetup -a
+}
+
+start32 () {
+       local facet=$1
+       shift
+       local device=$1
+       shift
+       mkdir -p ${MOUNT%/*}/${facet}
+
+       echo "Starting local ${facet}: $@ $device ${MOUNT%/*}/${facet}"
+       mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}
+       RC=$?
+       if [ $RC -ne 0 ]; then
+               echo "mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}"
+               echo "Start of ${device} of local ${facet} failed ${RC}"
+       fi 
+       losetup -a
+       return $RC
+}
+
+cleanup_nocli32 () {
+       stop32 mds -f
+       stop32 ost1 -f
+       wait_exit_ST client
+}
+
+cleanup_32() {
+       trap 0
+       echo "Cleanup test_32 umount $MOUNT ..."
+       umount -f $MOUNT || true
+       echo "Cleanup local mds ost1 ..."
+       cleanup_nocli32
+       unload_modules
+}
+
  test_32a() {
-        # XXX - make this run on client-only systems with real hardware on
-        #       the OST and MDT
-        #       there appears to be a lot of assumption here about loopback
-        #       devices
-        # or maybe this test is just totally useless on a client-only system
+       # this test is totally useless on a client-only system
+       [ -n "$CLIENTONLY" -o -n "$CLIENTMODSONLY" ] && skip "client only testing" && return 0
         [ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; }
-       [ "$mds_HOST" = "`hostname`" ] || { skip "remote MDS" && return 0; }
-       [ "$ost_HOST" = "`hostname`" -o "$ost1_HOST" = "`hostname`" ] || \
-               { skip "remote OST" && return 0; }
+       [ -z "$TUNEFS" ] && skip "No tunefs" && return 0
  
-        [ -z "$TUNEFS" ] && skip "No tunefs" && return
         local DISK1_4=$LUSTRE/tests/disk1_4.zip
         [ ! -r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return
  
@@ -1166,14 +988,17 @@ test_32a() {
         lctl set_param debug=$PTLDEBUG
  
         $TUNEFS $tmpdir/mds || error "tunefs failed"
+
         # nids are wrong, so client wont work, but server should start
-       start mds $tmpdir/mds "-o loop,exclude=lustre-OST0000" || return 3
-        local UUID=$(lctl get_param -n mds.lustre-MDT0000.uuid)
+       start32 mds $tmpdir/mds "-o loop,exclude=lustre-OST0000" && \
+               trap cleanup_32 EXIT INT || return 3
+        
+       local UUID=$(lctl get_param -n mds.lustre-MDT0000.uuid)
         echo MDS uuid $UUID
         [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" 
  
         $TUNEFS --mgsnode=`hostname` $tmpdir/ost1 || error "tunefs failed"
-       start ost1 $tmpdir/ost1 "-o loop" || return 5
+       start32 ost1 $tmpdir/ost1 "-o loop" || return 5
         UUID=$(lctl get_param -n obdfilter.lustre-OST0000.uuid)
         echo OST uuid $UUID
         [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" 
@@ -1191,40 +1016,37 @@ test_32a() {
  
         # With a new good MDT failover nid, we should be able to mount a client
         # (but it cant talk to OST)
-        local OLDMOUNTOPT=$MOUNTOPT
-        MOUNTOPT="exclude=lustre-OST0000"
-       mount_client $MOUNT
-        MOUNTOPT=$OLDMOUNTOPT
-       set_and_check client "lctl get_param -n mdc.*.max_rpcs_in_flight" "lustre-MDT0000.mdc.max_rpcs_in_flight" ||
-               return 11
+       local mountopt="-o exclude=lustre-OST0000"
  
-       zconf_umount `hostname` $MOUNT -f
-       cleanup_nocli
-       load_modules
+       local device=`h2$NETTYPE $HOSTNAME`:/lustre
+       echo "Starting local client: $HOSTNAME: $mountopt $device $MOUNT"
+       mount -t lustre $mountopt $device $MOUNT || return 1
  
-        # mount a second time to make sure we didnt leave upgrade flag on
+       local old=$(lctl get_param -n mdc.*.max_rpcs_in_flight)
+       local new=$((old + 5))
+       lctl conf_param lustre-MDT0000.mdc.max_rpcs_in_flight=$new
+       wait_update $HOSTNAME "lctl get_param -n mdc.*.max_rpcs_in_flight" $new || return 11
+
+       cleanup_32
+
+       # mount a second time to make sure we didnt leave upgrade flag on
         load_modules
         $TUNEFS --dryrun $tmpdir/mds || error "tunefs failed"
-       load_modules
-       start mds $tmpdir/mds "-o loop,exclude=lustre-OST0000" || return 12
-       cleanup_nocli
+       start32 mds $tmpdir/mds "-o loop,exclude=lustre-OST0000" && \
+               trap cleanup_32 EXIT INT || return 12
+
+       cleanup_32
  
-       [ -d $tmpdir ] && rm -rf $tmpdir
+       rm -rf $tmpdir || true  # true is only for TMP on NFS
  }
  run_test 32a "Upgrade from 1.4 (not live)"
  
  test_32b() {
-        # XXX - make this run on client-only systems with real hardware on
-        #       the OST and MDT
-        #       there appears to be a lot of assumption here about loopback
-        #       devices
-        # or maybe this test is just totally useless on a client-only system
-        [ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; }
-        [ "$mds_HOST" = "`hostname`" ] || { skip "remote MDS" && return 0; }
-        [ "$ost_HOST" = "`hostname`" -o "$ost1_HOST" = "`hostname`" ] || \
-               { skip "remote OST" && return 0; }
+       # this test is totally useless on a client-only system
+       [ -n "$CLIENTONLY" -o -n "$CLIENTMODSONLY" ] && skip "client only testing" && return 0
+       [ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; }
+       [ -z "$TUNEFS" ] && skip "No tunefs" && return
  
-        [ -z "$TUNEFS" ] && skip "No tunefs" && return
         local DISK1_4=$LUSTRE/tests/disk1_4.zip
         [ ! -r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return
  
@@ -1232,17 +1054,19 @@ test_32b() {
         unzip -o -j -d $tmpdir $DISK1_4 || { skip "Cant unzip $DISK1_4, skipping" && return ; }
         load_modules
         lctl set_param debug=$PTLDEBUG
-       NEWNAME=sofia
+       local NEWNAME=sofia
  
         # writeconf will cause servers to register with their current nids
         $TUNEFS --writeconf --fsname=$NEWNAME $tmpdir/mds || error "tunefs failed"
-       start mds $tmpdir/mds "-o loop" || return 3
+       start32 mds $tmpdir/mds "-o loop" && \
+               trap cleanup_32 EXIT INT || return 3
+
         local UUID=$(lctl get_param -n mds.${NEWNAME}-MDT0000.uuid)
         echo MDS uuid $UUID
         [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" 
  
         $TUNEFS --mgsnode=`hostname` --fsname=$NEWNAME --writeconf $tmpdir/ost1 || error "tunefs failed"
-       start ost1 $tmpdir/ost1 "-o loop" || return 5
+       start32 ost1 $tmpdir/ost1 "-o loop" || return 5
         UUID=$(lctl get_param -n obdfilter.${NEWNAME}-OST0000.uuid)
         echo OST uuid $UUID
         [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID"
@@ -1258,16 +1082,22 @@ test_32b() {
         # MDT and OST should have registered with new nids, so we should have
         # a fully-functioning client
         echo "Check client and old fs contents"
-       OLDFS=$FSNAME
-       FSNAME=$NEWNAME
-       mount_client $MOUNT
-       FSNAME=$OLDFS
-       set_and_check client "lctl get_param -n mdc.*.max_rpcs_in_flight" "${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight" || return 11
+
+       local device=`h2$NETTYPE $HOSTNAME`:/$NEWNAME
+       echo "Starting local client: $HOSTNAME: $device $MOUNT"
+       mount -t lustre $device $MOUNT || return 1
+
+       local old=$(lctl get_param -n mdc.*.max_rpcs_in_flight)
+       local new=$((old + 5))
+       lctl conf_param ${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight=$new
+       wait_update $HOSTNAME "lctl get_param -n mdc.*.max_rpcs_in_flight" $new || return 11
+
         [ "$(cksum $MOUNT/passwd | cut -d' ' -f 1,2)" == "2479747619 779" ] || return 12  
         echo "ok."
  
-       cleanup
-       [ -d $tmpdir ] && rm -rf $tmpdir
+       cleanup_32
+
+       rm -rf $tmpdir || true  # true is only for TMP on NFS
  }
  run_test 32b "Upgrade from 1.4 with writeconf"
  
@@ -1291,8 +1121,12 @@ test_33a() { # bug 12333, was test_33
          do_facet mds "$LCTL conf_param $FSNAME2.sys.timeout=200" || rc=1
          mkdir -p $MOUNT2
          mount -t lustre $MGSNID:/${FSNAME2} $MOUNT2 || rc=2
+        cp /etc/hosts $MOUNT2/. || rc=3
          echo "ok."
  
+        cp /etc/hosts $MOUNT2/ || rc=3 
+        $LFS getstripe $MOUNT2/hosts
+
          umount -d $MOUNT2
          stop fs2ost -f
          stop fs2mds -f
@@ -1585,7 +1419,7 @@ run_test 40 "race during service thread startup"
  test_41() { #bug 14134
          local rc
          start mds $MDSDEV $MDS_MOUNT_OPTS -o nosvc -n
-        start ost `ostdevname 1` $OST_MOUNT_OPTS
+        start ost1 `ostdevname 1` $OST_MOUNT_OPTS
          start mds $MDSDEV $MDS_MOUNT_OPTS -o nomgs
          mkdir -p $MOUNT
          mount_client $MOUNT || return 1
@@ -1595,7 +1429,7 @@ test_41() { #bug 14134
          cat $MOUNT/$tfile
  
          umount_client $MOUNT
-        stop ost -f || return 201
+        stop ost1 -f || return 201
          stop mds -f || return 202
          stop mds -f || return 203
          unload_modules || return 204
@@ -1614,5 +1448,155 @@ test_42() { #bug 14693
  }
  run_test 42 "invalid config param should not prevent client from mounting"
  
+test_43() { #bug 15993
+        setup
+        VERSION_1_8=$(do_facet mds $LCTL get_param version | grep ^lustre.*1\.[78])
+        if [ -z "$VERSION_1_8" ]; then
+                skip "skipping test for non 1.8 MDS"
+                cleanup
+                return 0
+        fi
+
+        check_mount || return 2
+        testfile=$DIR/$tfile
+        lma="this-should-be-removed-after-remount-and-accessed"
+        touch $testfile
+        echo "set/get trusted.lma"
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING    0x13f
+        do_facet mds "lctl set_param fail_loc=0x13f"
+        lctl set_param fail_loc=0x13f
+        setfattr -n trusted.lma -v $lma $testfile || error "create common EA"
+        do_facet mds "lctl set_param fail_loc=0"
+        lctl set_param fail_loc=0
+        ATTR=$(getfattr -n trusted.lma $testfile 2> /dev/null | grep trusted.lma)
+        [ "$ATTR" = "trusted.lma=\"$lma\"" ] || error "check common EA"
+        umount_client $MOUNT
+        stop_mds
+        sleep 5
+        start_mds
+        mount_client $MOUNT
+        check_mount || return 3
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA    0x13e
+        do_facet mds "lctl set_param fail_loc=0x13e"
+        stat $testfile
+        do_facet mds "lctl set_param fail_loc=0"
+        getfattr -d -m trusted $testfile 2> /dev/null | \
+            grep "trusted.lma" && error "common EA not removed" || true
+        cleanup
+        return 0
+}
+run_test 43 "remove common EA if it exists"
+
+test_44() { # 16317
+        setup
+        check_mount || return 2
+        UUID=$($LCTL get_param llite.${FSNAME}*.uuid | cut -d= -f2)
+        STATS_FOUND=no
+        UUIDS=$(do_facet mds "$LCTL get_param mds.${FSNAME}*.exports.*.uuid")
+        for VAL in $UUIDS; do
+                NID=$(echo $VAL | cut -d= -f1)
+                CLUUID=$(echo $VAL | cut -d= -f2)
+                [ "$UUID" = "$CLUUID" ] && STATS_FOUND=yes && break
+        done
+        [ "$STATS_FOUND" = "no" ] && error "stats not found for client"
+        cleanup
+        return 0
+}
+run_test 44 "mounted client proc entry exists"
+
+test_45() { #17310
+        setup
+        check_mount || return 2
+        stop_mds
+        df -h $MOUNT &
+        log "sleep 60 sec"
+        sleep 60
+#define OBD_FAIL_PTLRPC_LONG_UNLINK   0x50f
+        do_facet client "lctl set_param fail_loc=0x50f"
+        log "sleep 10 sec"
+        sleep 10
+        manual_umount_client --force || return 3
+        do_facet client "lctl set_param fail_loc=0x0"
+        start_mds
+        mount_client $MOUNT || return 4
+        cleanup
+        return 0
+}
+run_test 45 "long unlink handling in ptlrpcd"
+
+test_46a() {
+       OSTCOUNT=6
+       reformat
+       start_mds || return 1
+       #first client should see only one ost
+       start_ost || return 2
+       #start_client
+       mount_client $MOUNT || return 3
+       
+       start_ost2 || return 4
+       start ost3 `ostdevname 3` $OST_MOUNT_OPTS || return 5
+       start ost4 `ostdevname 4` $OST_MOUNT_OPTS || return 6
+       start ost5 `ostdevname 5` $OST_MOUNT_OPTS || return 7
+       # wait until ost2-5 is sync
+       sleep 5
+       #second client see both ost's
+
+       mount_client $MOUNT2 || return 8
+       $LFS setstripe $MOUNT2 -c -1 || return 9
+       $LFS getstripe $MOUNT2 || return 10
+
+       echo "ok" > $MOUNT2/widestripe
+       $LFS getstripe $MOUNT2/widestripe || return 11
+       # fill acl buffer for avoid expand lsm to them
+       awk -F : '{if (FNR < 25) { print "u:"$1":rwx" }}' /etc/passwd | while read acl; do  
+           setfacl -m $acl $MOUNT2/widestripe
+       done
+
+       # will be deadlock
+       stat $MOUNT/widestripe || return 12
+
+       umount_client $MOUNT2 || return 13
+       umount_client $MOUNT || return 14
+       stop ost5 -f || return 20
+       stop ost4 -f || return 21
+       stop ost3 -f || return 22
+       stop_ost2 || return 23
+       stop_ost || return 24
+       stop_mds || return 25
+}
+run_test 46a "handle ost additional - wide striped file"
+
+test_47() { #17674
+        setup
+        check_mount || return 2
+        $LCTL set_param ldlm.namespaces.$FSNAME-*-*-*.lru_size=100
+
+        local lru_size=[]
+        local count=0
+        for ns in $($LCTL get_param ldlm.namespaces.$FSNAME-*-*-*.lru_size); do
+            lrs=$(echo $ns | sed 's/.*lru_size=//')
+            lru_size[count]=$lrs
+            let count=count+1
+        done
+        
+        facet_failover ost1
+        facet_failover mds
+        df -h $MOUNT || return 3
+
+        count=0
+        for ns in $($LCTL get_param ldlm.namespaces.$FSNAME-*-*-*.lru_size); do
+            lrs=$(echo $ns | sed 's/.*lru_size=//')
+            if ! test "$lrs" -eq "${lru_size[count]}"; then
+                n=$(echo $ns | sed -e 's/ldlm.namespaces.//' -e 's/.lru_size=.*//')
+                error "$n has lost lru_size: $lrs vs. ${lru_size[count]}"
+            fi
+            let count=count+1
+        done
+        
+        cleanup
+        return 0
+}
+run_test 47 "server restart does not make client loss lru_resize settings"
+
  equals_msg `basename $0`: test complete
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/createdestroy.c b/lustre/tests/createdestroy.c

index b5b7c2b..bc145e4 100644 (file)
--- a/lustre/tests/createdestroy.c
+++ b/lustre/tests/createdestroy.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdlib.h>
  #include <stdio.h>
  #include <unistd.h>
diff --git a/lustre/tests/createmany-mpi.c b/lustre/tests/createmany-mpi.c

index 524c2d9..6e558fc 100644 (file)
--- a/lustre/tests/createmany-mpi.c
+++ b/lustre/tests/createmany-mpi.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <stdio.h>
diff --git a/lustre/tests/createmany.c b/lustre/tests/createmany.c

index 466c156..3ae06cb 100644 (file)
--- a/lustre/tests/createmany.c
+++ b/lustre/tests/createmany.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <stdio.h>
diff --git a/lustre/tests/createtest.c b/lustre/tests/createtest.c

index 6223034..6f7ec0d 100644 (file)
--- a/lustre/tests/createtest.c
+++ b/lustre/tests/createtest.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/types.h>
diff --git a/lustre/tests/directio.c b/lustre/tests/directio.c

index 1108cba..e4ee62d 100644 (file)
--- a/lustre/tests/directio.c
+++ b/lustre/tests/directio.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _GNU_SOURCE
  #define  _GNU_SOURCE
  #endif
diff --git a/lustre/tests/fchdir_test.c b/lustre/tests/fchdir_test.c

index 83c096e..b300396 100644 (file)
--- a/lustre/tests/fchdir_test.c
+++ b/lustre/tests/fchdir_test.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdlib.h>
  #include <stdio.h>
  #include <string.h>
diff --git a/lustre/tests/flock.c b/lustre/tests/flock.c

index 83da7c5..17f9687 100644 (file)
--- a/lustre/tests/flock.c
+++ b/lustre/tests/flock.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Lustre Light user test program
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/flock.c
+ *
+ * Lustre Light user test program
   */
  
  #define _BSD_SOURCE
diff --git a/lustre/tests/flock_test.c b/lustre/tests/flock_test.c

index 8ca0797..664be3d 100644 (file)
--- a/lustre/tests/flock_test.c
+++ b/lustre/tests/flock_test.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <errno.h>
diff --git a/lustre/tests/flocks_test.c b/lustre/tests/flocks_test.c

index ff54e06..97890d8 100644 (file)
--- a/lustre/tests/flocks_test.c
+++ b/lustre/tests/flocks_test.c
@@ -1,44 +1,157 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <unistd.h>
-
+#include <pthread.h>
  #include <sys/file.h>
+#include <stdarg.h>
  
-void usage(void)
+#define MAX_PATH_LENGTH 4096
+/**
+ * helper functions
+ */
+int t_fcntl(int fd, int cmd, ...)
  {
-        fprintf(stderr, "usage: ./flocks_test on|off -c|-f|-l /path/to/file\n");
-        exit(EXIT_FAILURE);
+        va_list ap;
+        long arg;
+        struct flock *lock;
+        int rc = -1;
+
+        va_start(ap, cmd);
+        switch (cmd) {
+        case F_GETFL:
+                va_end(ap);
+                rc = fcntl(fd, cmd);
+                if (rc == -1) {
+                        fprintf(stderr, "fcntl GETFL failed: %s\n",
+                                strerror(errno));
+                        return(1);
+                }
+                break;
+        case F_SETFL:
+                arg = va_arg(ap, long);
+                va_end(ap);
+                rc = fcntl(fd, cmd, arg);
+                if (rc == -1) {
+                        fprintf(stderr, "fcntl SETFL %ld failed: %s\n",
+                                arg, strerror(errno));
+                        return(1);
+                }
+                break;
+        case F_GETLK:
+        case F_SETLK:
+        case F_SETLKW:
+                lock = va_arg(ap, struct flock *);
+                va_end(ap);
+                rc = fcntl(fd, cmd, lock);
+                if (rc == -1) {
+                        fprintf(stderr, "fcntl cmd %d failed: %s\n",
+                                cmd, strerror(errno));
+                        return(1);
+                }
+                break;
+        case F_DUPFD:
+                arg = va_arg(ap, long);
+                va_end(ap);
+                rc = fcntl(fd, cmd, arg);
+                if (rc == -1) {
+                        fprintf(stderr, "fcntl F_DUPFD %d failed: %s\n",
+                                (int)arg, strerror(errno));
+                        return(1);
+                }
+                break;
+        default:
+                va_end(ap);
+                fprintf(stderr, "fcntl cmd %d not supported\n", cmd);
+                return(1);
+        }
+        return rc;
  }
  
-int main(int argc, char *argv[])
+int t_unlink(const char *path)
+{
+        int rc;
+
+        rc = unlink(path);
+        if (rc)
+                fprintf(stderr, "unlink(%s) error: %s\n", path, strerror(errno));
+        return rc;
+}
+
+/** =================================================================
+ * test number 1
+ * 
+ * normal flock test
+ */
+void t1_usage(void)
+{
+        fprintf(stderr, "usage: ./flocks_test 1 on|off -c|-f|-l /path/to/file\n");
+}
+
+int t1(int argc, char *argv[])
  {
          int fd;
          int mount_with_flock = 0;
          int error = 0;
  
-        if (argc != 4)
-                usage();
-        
-        if (!strncmp(argv[1], "on", 3)) {
+        if (argc != 5) {
+                t1_usage();
+                return EXIT_FAILURE;
+        }
+
+        if (!strncmp(argv[2], "on", 3)) {
                  mount_with_flock = 1;
-        } else if (!strncmp(argv[1], "off", 4)) {
+        } else if (!strncmp(argv[2], "off", 4)) {
                  mount_with_flock = 0;
          } else {
-                usage();
+                t1_usage();
+                return EXIT_FAILURE;
          }
  
-        if ((fd = open(argv[3], O_RDWR)) < 0) {
-                fprintf(stderr, "Couldn't open file: %s\n", argv[2]);
-                exit(EXIT_FAILURE);
+        if ((fd = open(argv[4], O_RDWR)) < 0) {
+                fprintf(stderr, "Couldn't open file: %s\n", argv[3]);
+                return EXIT_FAILURE;
          }
  
-        if (!strncmp(argv[2], "-c", 3)) {
+        if (!strncmp(argv[3], "-c", 3)) {
                  struct flock fl;
  
                  fl.l_type = F_RDLCK;
@@ -47,12 +160,13 @@ int main(int argc, char *argv[])
                  fl.l_len = 1;
  
                  error = fcntl(fd, F_SETLK, &fl);
-        } else if (!strncmp(argv[2], "-l", 3)) {
+        } else if (!strncmp(argv[3], "-l", 3)) {
                  error = lockf(fd, F_LOCK, 1);
-        } else if (!strncmp(argv[2], "-f", 3)) {
+        } else if (!strncmp(argv[3], "-f", 3)) {
                  error = flock(fd, LOCK_EX);
          } else {
-                usage();
+                t1_usage();
+                return EXIT_FAILURE;
          }
  
          if (mount_with_flock)
@@ -60,3 +174,128 @@ int main(int argc, char *argv[])
          else
                  return((error == 0) ? EXIT_FAILURE : EXIT_SUCCESS);
  }
+
+/** ===============================================================
+ * test number 2
+ * 
+ * 2 threads flock ops interweave
+ */
+typedef struct {
+        struct flock* lock;
+        int fd;
+} th_data;
+
+void* t2_thread1(void *arg)
+{
+        struct flock *lock = ((th_data *)arg)->lock;
+        int fd             = ((th_data *)arg)->fd;
+
+        printf("thread 1: set write lock (blocking)\n");
+        lock->l_type = F_WRLCK;
+        t_fcntl(fd, F_SETLKW, lock);
+        printf("thread 1: set write lock done\n");
+        t_fcntl(fd, F_GETLK, lock);
+        printf("thread 1: unlock\n");
+        lock->l_type = F_UNLCK;
+        t_fcntl(fd, F_SETLK, lock);
+        printf("thread 1: unlock done\n");
+        return 0;
+}
+
+void* t2_thread2(void *arg)
+{
+        struct flock *lock = ((th_data *)arg)->lock;
+        int fd             = ((th_data *)arg)->fd;
+
+        sleep(2);
+        printf("thread 2: unlock\n");
+        lock->l_type = F_UNLCK;
+        t_fcntl(fd, F_SETLK, lock);
+        printf("thread 2: unlock done\n");
+        printf("thread 2: set write lock (non-blocking)\n");
+        lock->l_type = F_WRLCK;
+        t_fcntl(fd, F_SETLK, lock);
+        printf("thread 2: set write lock done\n");
+        t_fcntl(fd, F_GETLK, lock);
+        return 0;
+}
+
+int t2(int argc, char* argv[])
+{
+        struct flock lock = {
+                .l_type = F_RDLCK,
+                .l_whence = SEEK_SET,
+        };
+        char file[MAX_PATH_LENGTH] = "";
+        int  fd, rc;
+        pthread_t th1, th2;
+        th_data   ta;
+
+        snprintf(file, MAX_PATH_LENGTH, "%s/test_t2_file", argv[2]);
+
+        fd = open(file, O_RDWR|O_CREAT, (mode_t)0666);
+        if (fd < 0) {
+                fprintf(stderr, "error open file: %s\n", file);
+                return EXIT_FAILURE;
+        }
+
+        t_fcntl(fd, F_SETFL, O_APPEND);
+        if (!(rc = t_fcntl(fd, F_GETFL)) & O_APPEND) {
+                fprintf(stderr, "error get flag: ret %x\n", rc);
+                return EXIT_FAILURE;
+        }
+
+        ta.lock = &lock;
+        ta.fd   = fd;
+        rc = pthread_create(&th1, NULL, t2_thread1, &ta);
+        if (rc) {
+                fprintf(stderr, "error create thread 1\n");
+                rc = EXIT_FAILURE;
+                goto out;
+        }
+        rc = pthread_create(&th2, NULL, t2_thread2, &ta);
+        if (rc) {
+                fprintf(stderr, "error create thread 2\n");
+                rc = EXIT_FAILURE;
+                goto out;
+        }
+        (void)pthread_join(th1, NULL);
+        (void)pthread_join(th2, NULL);
+out:
+        t_unlink(file);
+        close(fd);
+        return rc;
+}
+
+/** ==============================================================
+ * program entry
+ */
+void usage(void)
+{
+        fprintf(stderr, "usage: ./flocks_test test# [corresponding arguments]\n");
+}
+
+int main(int argc, char* argv[])
+{
+        int test_no;
+        int rc = EXIT_SUCCESS;
+
+        if (argc < 1) {
+                usage();
+                exit(EXIT_FAILURE);
+        }
+        test_no = atoi(argv[1]);
+
+        switch(test_no) {
+        case 1:
+                rc = t1(argc, argv);
+                break;
+        case 2:
+                rc = t2(argc, argv);
+                break;
+        default:
+                fprintf(stderr, "unknow test number %s\n", argv[1]);
+                break;
+        }
+        return rc;
+}
diff --git a/lustre/tests/getdents.c b/lustre/tests/getdents.c

index b4155a9..7237160 100644 (file)
--- a/lustre/tests/getdents.c
+++ b/lustre/tests/getdents.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <dirent.h>
diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh

index cd4ffaf..fbdfd1b 100755 (executable)
--- a/lustre/tests/insanity.sh
+++ b/lustre/tests/insanity.sh
@@ -35,9 +35,11 @@ assert_env mds_HOST MDS_MKFS_OPTS MDSDEV
  assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
  assert_env LIVE_CLIENT FSNAME
  
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
  
-# This can be a regexp, to allow more clients
-CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS`"}
+# FAIL_CLIENTS list should not contain the LIVE_CLIENT
+FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g")
  
  DIR=${DIR:-$MOUNT}
  
@@ -71,13 +73,6 @@ shutdown_client() {
      fi
  }
  
-reboot_node() {
-    NODE=$1
-    if [ "$FAILURE_MODE" = HARD ]; then
-       $POWER_UP $NODE
-    fi
-}
-
  fail_clients() {
      num=$1
  
@@ -103,7 +98,7 @@ fail_clients() {
      echo "down clients: $DOWN_CLIENTS"
  
      for client in $DOWN_CLIENTS; do
-       reboot_node $client
+       boot_node $client
      done
      DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
      client_rmdirs
@@ -160,7 +155,7 @@ clients_recover_osts() {
  #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
  }
  
-cleanup_and_setup_lustre
+check_and_setup_lustre
  
  # 9 Different Failure Modes Combinations
  echo "Starting Test 17 at `date`"
@@ -370,6 +365,7 @@ test_6() {
      echo "Test Lustre stability after OST failure"
      client_df &
      DFPIDA=$!
+    echo DFPIDA=$DFPIDA
      sleep 5
  
      #CLIENT Portion
@@ -380,17 +376,20 @@ test_6() {
      echo "Test Lustre stability after CLIENTs failure"
      client_df &
      DFPIDB=$!
+    echo DFPIDB=$DFPIDB
      sleep 5
      
      #Reintegration
      echo "Reintegrating OST/CLIENTs"
      wait_for ost1
      start_ost 1
-    reintegrate_clients
+    reintegrate_clients || return 1
      sleep 5 
  
+    wait_remote_prog df $((TIMEOUT * 3 + 10)) 
      wait $DFPIDA
      wait $DFPIDB
+
      echo "Verifying mount"
      [ -z "$(mounted_lustre_filesystems)" ] && return 3
      client_df
@@ -579,4 +578,4 @@ run_test 10 "Running Availability for 6 hours..."
  
  equals_msg `basename $0`: test complete, cleaning up
  check_and_cleanup_lustre
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/iopentest1.c b/lustre/tests/iopentest1.c

index acc1112..8e1aba4 100644 (file)
--- a/lustre/tests/iopentest1.c
+++ b/lustre/tests/iopentest1.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <sys/types.h>
diff --git a/lustre/tests/iopentest2.c b/lustre/tests/iopentest2.c

index 046de92..d38d395 100644 (file)
--- a/lustre/tests/iopentest2.c
+++ b/lustre/tests/iopentest2.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <sys/types.h>
diff --git a/lustre/tests/it_test.c b/lustre/tests/it_test.c

index 44c7f6f..adaccde 100644 (file)
--- a/lustre/tests/it_test.c
+++ b/lustre/tests/it_test.c
@@ -1,7 +1,45 @@
-/* vi:set ts=8 sw=8 expandtab: */
-/* Unit test tool for interval tree.
- * Written by jay <jxiong@clusterfs.com> 
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/it_test.c
+ *
+ * Unit test tool for interval tree.
+ *
+ * Author: jay <jxiong@clusterfs.com>
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
diff --git a/lustre/tests/large-scale.sh b/lustre/tests/large-scale.sh

new file mode 100644 (file)

index 0000000..580c95e
--- /dev/null
+++ b/lustre/tests/large-scale.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+set -e
+
+# bug number:
+ALWAYS_EXCEPT="$LARGE_SCALE_EXCEPT"
+
+SAVE_PWD=$PWD
+PTLDEBUG=${PTLDEBUG:--1}
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
+
+[ -n "$CLIENTS" ] || { skip "$0: Need two or more clients" && exit 0; }
+[ $CLIENTCOUNT -ge 2 ] || \
+    { skip "$0: Need two or more clients, have $CLIENTCOUNT" && exit 0; }
+
+#
+[ "$SLOW" = "no" ] && EXCEPT_SLOW=""
+
+MOUNT_2=""
+build_test_filter
+
+check_and_setup_lustre
+rm -rf $DIR/[df][0-9]*
+
+[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
+
+# VBR scale tests
+check_vbr () {
+    do_nodes $CLIENTS "$LCTL get_param mdc.*.connect_flags | grep version_recovery" 
+}
+
+check_vbr || \
+    { skip "$0: no version_recovery" && exit 0; }
+
+FAKE_NUM_MAX=${FAKE_NUM_MAX:-1000}
+[ "$SLOW" = "no" ] && FAKE_NUM_MAX=100
+
+do_and_time () {
+   local cmd=$1
+
+   local start_ts=`date +%s`
+
+   $cmd
+
+   local current_ts=`date +%s`
+   ELAPSED=`expr $current_ts - $start_ts`
+   echo "===== START $start_ts CURRENT $current_ts"
+}
+
+delete_fake_exports () {
+    NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+
+    OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
+    NEW_AGE=0
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
+    sleep $((NEW_AGE + 3))
+    EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
+    [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
+
+    do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
+}
+
+test_1b() {
+    local FAKE_NUM
+    local NUM
+
+    for FAKE_NUM in 10 $FAKE_NUM_MAX; do
+        zconf_umount_clients $CLIENTS $DIR
+        zconf_mount $CLIENT1 $DIR
+
+        NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+
+        log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
+        create_fake_exports mds $FAKE_NUM
+        NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+        [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -ne $FAKE_NUM"
+        echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
+        do_and_time "zconf_mount_clients $CLIENTS $DIR"
+        echo "==== $TESTNAME ===== CONNECTION TIME $ELAPSED: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
+
+        # do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
+        delete_fake_exports
+    done
+
+    return 0
+}
+run_test 1b "VBR: connect $CLIENTCOUNT clients with delayed exports"
+
+# Sigh. One more function for mds failover
+# fail fn does not do df on all clients
+fail_mds () {
+    facet_failover mds
+    client_df
+}
+
+test_1c() {
+    zconf_mount_clients $CLIENTS $DIR
+
+    # sanity mds fail (to exclude the recults on fresh formatted fs)
+    facet_failover mds
+
+    local current_ts
+    local elapsed
+    local FAKE_NUM
+    local NUM
+
+    for FAKE_NUM in 10 $FAKE_NUM_MAX; do
+
+        NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+
+        log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
+        create_fake_exports mds $FAKE_NUM
+        NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+        [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -ne $FAKE_NUM"
+        echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
+
+        replay_barrier mds
+        do_nodes $CLIENTS "createmany -o $DIR/$tfile-\\\$(hostname)" 25
+        # XXX For FAILURE_MODE=HARD it is better to exclude
+        # shutdown_facet and reboot_facet time 
+        fail_mds
+
+        local current_ts=`date +%s`
+        local elapsed=`expr $current_ts - $RECOVERY_START_TIME`
+
+        do_nodes $CLIENTS "unlinkmany $DIR/$tfile-\\\$(hostname) 25"
+        echo "==== $TESTNAME ===== RECOVERY TIME $elapsed: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
+
+        # do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
+        delete_fake_exports
+    done
+
+    return 0
+}
+run_test 1c "VBR: recovery $CLIENTCOUNT clients with delayed exports"
+
+
+test_1d() {
+    local FAKE_NUM
+    local NUM
+
+    for FAKE_NUM in 10 $FAKE_NUM_MAX; do
+        zconf_umount_clients $CLIENTS $DIR
+        zconf_mount $CLIENT1 $DIR
+
+        NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+
+        log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
+        create_fake_exports mds $FAKE_NUM
+        NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+        [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -lt $FAKE_NUM"
+        echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
+
+        OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
+        echo OLD_AGE=$OLD_AGE
+        NEW_AGE=10
+        do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
+        sleep $((NEW_AGE + 3))
+        EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
+        [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
+
+        do_and_time "zconf_mount_clients $CLIENTS $DIR"
+        echo "==== $TESTNAME===== CONNECTION TIME $ELAPSED: expired FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
+
+        do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
+    done
+
+    return 0
+}
+run_test 1d "VBR: expire exports, connect $CLIENTCOUNT clients"
+# VBR scale tests end
+
+equals_msg `basename $0`: test complete, cleaning up
+check_and_cleanup_lustre
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
diff --git a/lustre/tests/ldaptest.c b/lustre/tests/ldaptest.c

index c1a7499..899377a 100644 (file)
--- a/lustre/tests/ldaptest.c
+++ b/lustre/tests/ldaptest.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <ldap.h>
  #include <stdio.h>
  #include <errno.h>
diff --git a/lustre/tests/lfscktest.sh b/lustre/tests/lfscktest.sh

index b10190a..67fbd7e 100755 (executable)
--- a/lustre/tests/lfscktest.sh
+++ b/lustre/tests/lfscktest.sh
@@ -25,6 +25,9 @@ LUSTRE=${LUSTRE:-`dirname $0`/..}
  init_test_env $@
  . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  
+remote_mds && skip "remote MDS" && exit 0
+remote_ost && skip "remote OST" && exit 0
+
  # if nothing mounted, don't nuke MOUNT variable needed in llmount.sh
  WAS_MOUNTED=$(mounted_lustre_filesystems | head -1)
  if [ -z "$WAS_MOUNTED" ]; then
@@ -45,7 +48,7 @@ if [ "$WAS_MOUNTED" ]; then
  fi
  
  get_mnt_devs() {
-       DEVS=`cat /proc/fs/lustre/$1/*/mntdev`
+       DEVS=`lctl get_param -n $1.*.mntdev`
         for DEV in $DEVS; do
                 case $DEV in
                 *loop*) losetup $DEV | sed -e "s/.*(//" -e "s/).*//" ;;
diff --git a/lustre/tests/ll_dirstripe_verify.c b/lustre/tests/ll_dirstripe_verify.c

index 9998142..77c5512 100644 (file)
--- a/lustre/tests/ll_dirstripe_verify.c
+++ b/lustre/tests/ll_dirstripe_verify.c
@@ -1,6 +1,40 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/ll_dirstripe_verify.c
+ *
   * ll_dirstripe_verify <dir> <file>:
   * - to verify if the file has the same lov_user_md setting as the parent dir.
   * - if dir's offset is set -1, ll_dirstripe_verify <dir> <file1> <file2>
@@ -204,7 +238,7 @@ int main(int argc, char **argv)
                  return rc;
          }
  
-        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT);
+        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC);
          if ((lum_dir = (struct lov_user_md *)malloc(lum_size)) == NULL) {
                  rc = ENOMEM;
                  llapi_err(LLAPI_MSG_ERROR, "error: can't allocate %d bytes "
diff --git a/lustre/tests/ll_getstripe_info.c b/lustre/tests/ll_getstripe_info.c

index b8df70c..ca5093c 100644 (file)
--- a/lustre/tests/ll_getstripe_info.c
+++ b/lustre/tests/ll_getstripe_info.c
@@ -1,10 +1,45 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/ll_getstripe_info.c
+ *
   * ll_getstripe_info <file>:
   * - get file's stripe info.
   */
  
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
@@ -29,7 +64,7 @@ int main(int argc, char** argv)
                  return 1;
          }
  
-        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT);
+        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC);
  
          if ((lum_file = (struct lov_user_md *)malloc(lum_size)) == NULL) {
                  fprintf(stderr, "unable to allocate memory for ioctl's");
diff --git a/lustre/tests/ll_sparseness_verify.c b/lustre/tests/ll_sparseness_verify.c

index 574f064..b0aabeb 100644 (file)
--- a/lustre/tests/ll_sparseness_verify.c
+++ b/lustre/tests/ll_sparseness_verify.c
@@ -1,11 +1,45 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/ll_sparseness_verify.c
+ *
   * The companion to ll_sparseness_write; walk all the bytes in the file.
   * the bytes at the offsets specified on the command line must be '+', as
- * previously written by ll_sparseness_write.  All other bytes must be
- * 0.
+ * previously written by ll_sparseness_write.  All other bytes must be 0.
   */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
diff --git a/lustre/tests/ll_sparseness_write.c b/lustre/tests/ll_sparseness_write.c

index 7c11096..c925800 100644 (file)
--- a/lustre/tests/ll_sparseness_write.c
+++ b/lustre/tests/ll_sparseness_write.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
  #endif
diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh

index b3024aa..82217bc 100755 (executable)
--- a/lustre/tests/llmount.sh
+++ b/lustre/tests/llmount.sh
@@ -7,5 +7,6 @@ LUSTRE=${LUSTRE:-`dirname $0`/..}
  init_test_env $@
  . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  
+[ -n "$LOAD" ] && load_modules && exit 0
  [ -z "$NOFORMAT" ] && formatall
  [ -z "$NOSETUP" ] && setupall
diff --git a/lustre/tests/lockorder.sh b/lustre/tests/lockorder.sh

index 10e9573..4f1ca4b 100644 (file)
--- a/lustre/tests/lockorder.sh
+++ b/lustre/tests/lockorder.sh
@@ -28,7 +28,7 @@ NUM=0
  
  MINDIR=$DIR
  MAXDIR=$DIR
-MINRES=2000000000
+MINRES=4294967295
  MAXRES=0
  mkdir -p $MINDIR
  while [ $MINRES -gt $MAXRES ]; do
diff --git a/lustre/tests/lp_utils.c b/lustre/tests/lp_utils.c

index 0ca7a85..91ae0b0 100644 (file)
--- a/lustre/tests/lp_utils.c
+++ b/lustre/tests/lp_utils.c
@@ -1,23 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: You Feng <youfeng@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/lp_utils.c
+ *
+ * Author: You Feng <youfeng@clusterfs.com>
   */
  
  #include <mpi.h>
@@ -32,7 +50,7 @@
  #include <fcntl.h>
  #include <errno.h>
  #include "lustre/lustre_user.h"
-#include "lp_utils.h"
+#include "lustre/tests/lp_utils.h"
  
  #define MAX_PROCESSES 8
  
diff --git a/lustre/tests/lp_utils.h b/lustre/tests/lp_utils.h

index 993253a..52aca75 100644 (file)
--- a/lustre/tests/lp_utils.h
+++ b/lustre/tests/lp_utils.h
@@ -1,23 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: You Feng <youfeng@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/lp_utils.h
+ *
+ * Author: You Feng <youfeng@clusterfs.com>
   */
  
  #ifndef __LP_UTILS_H__
diff --git a/lustre/tests/mcreate.c b/lustre/tests/mcreate.c

index 9d48b11..871f7fb 100644 (file)
--- a/lustre/tests/mcreate.c
+++ b/lustre/tests/mcreate.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/mdsrate-create-large.sh b/lustre/tests/mdsrate-create-large.sh

new file mode 100644 (file)

index 0000000..00ad399
--- /dev/null
+++ b/lustre/tests/mdsrate-create-large.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+#
+# This test was used in a set of CMD3 tests (cmd3-4 test).
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+assert_env CLIENTS MDSRATE SINGLECLIENT MPIRUN
+
+MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+TESTDIR=$MOUNT
+
+# Requirements
+TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
+SINGLE_TARGET_RATE=$((1300 / OSTCOUNT))     # ops/sec
+AGGREGATE_TARGET_RATE=$((7000 / OSTCOUNT))  # ops/sec
+
+# Local test variables
+TESTDIR_SINGLE="${TESTDIR}/single"
+TESTDIR_MULTI="${TESTDIR}/multi"
+
+LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
+CLIENT=$SINGLECLIENT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+
+[ ! -x ${MDSRATE} ] && error "${MDSRATE} not built."
+
+log "===== $0 ====== " 
+
+check_and_setup_lustre
+
+generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
+
+$LFS setstripe $TESTDIR -c -1
+get_stripe $TESTDIR
+
+# Make sure we start with a clean slate
+rm -f ${LOG} PI*
+
+if [ -n "$NOSINGLE" ]; then
+    echo "NO Test for creates for a single client."
+else
+    log "===== $0 ### 1 NODE CREATE ###"
+    echo "Running creates on 1 node(s)."
+
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
+                        --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+    echo "+ ${COMMAND}"
+    mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+       [ -f $LOG ] && cat $LOG
+       error "mpirun ... mdsrate ... failed, aborting"
+    fi
+    
+    check_rate create ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+
+    log "===== $0 ### 1 NODE UNLINK ###"
+    echo "Running unlinks on 1 node(s)."
+
+    let NUM_FILES=${SINGLE_TARGET_RATE}\*${TIME_PERIOD}
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+                 --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+    echo "+ ${COMMAND}"
+    mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+ 
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+       [ -f $LOG ] && cat $LOG
+       error "mpirun ... mdsrate ... failed, aborting"
+    fi
+
+    check_rate unlink ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+fi
+
+if [ -n "$NOMULTI" ]; then
+    echo "NO test for create on multiple nodes."
+else
+
+    log "===== $0 ### $NUM_CLIENTS NODES CREATE ###"
+    echo "Running creates on ${NUM_CLIENTS} node(s)."
+
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
+                        --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+    echo "+ ${COMMAND}"
+    mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+       [ -f $LOG ] && cat $LOG
+       error "mpirun ... mdsrate ... failed, aborting"
+    fi
+
+    check_rate create ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
+
+    echo "Running unlinks on ${NUM_CLIENTS} node(s)."
+
+    let NUM_FILES=${AGGREGATE_TARGET_RATE}\*${TIME_PERIOD}
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+                  --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+    echo "+ ${COMMAND}"
+    mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+       [ -f $LOG ] && cat $LOG
+       error "mpirun ... mdsrate ... failed, aborting"
+    fi
+
+    check_rate unlink ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
+fi
+
+equals_msg `basename $0`: test complete, cleaning up
+rm -f $MACHINEFILE
+zconf_umount_clients $NODES_TO_USE $MOUNT
+check_and_cleanup_lustre
+#rm -f $LOG
+
+exit 0
diff --git a/lustre/tests/mdsrate-create-small.sh b/lustre/tests/mdsrate-create-small.sh

new file mode 100644 (file)

index 0000000..0f42e5d
--- /dev/null
+++ b/lustre/tests/mdsrate-create-small.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+#
+# This test was used in a set of CMD3 tests (cmd3-3 test). 
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+assert_env CLIENTS MDSRATE SINGLECLIENT MPIRUN
+
+MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+TESTDIR=$MOUNT
+
+# Requirements
+# The default number of stripes per file is set to 1 in test3/run_test.sh.
+TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
+SINGLE_TARGET_RATE=1400                # ops/sec
+AGGREGATE_TARGET_RATE=10000            # ops/sec
+
+# Local test variables
+TESTDIR_SINGLE="${TESTDIR}/single"
+TESTDIR_MULTI="${TESTDIR}/multi"
+
+LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
+CLIENT=$SINGLECLIENT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+# XXX - this needs to be determined given the number of MDTs and the number
+#       of clients.
+THREADS_PER_CLIENT=3                   # threads/client for multi client test
+if [ $NUM_CLIENTS -gt 50 ]; then
+    THREADS_PER_CLIENT=1
+fi
+
+[ ! -x ${MDSRATE} ] && error "${MDSRATE} not built."
+
+# Make sure we start with a clean slate
+rm -f ${LOG} PI*
+
+log "===== $0 ====== " 
+
+check_and_setup_lustre
+
+generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
+
+$LFS setstripe $TESTDIR -i 0 -c 1
+get_stripe $TESTDIR
+
+if [ -n "$NOSINGLE" ]; then
+    echo "NO Tests on single client."
+else
+    if [ -n "$NOCREATE" ]; then
+        echo "NO Test for creates for a single client."
+    else
+        do_node ${CLIENT} "rm -rf $TESTDIR_SINGLE"
+
+        log "===== $0 ### 1 NODE CREATE ###"
+        echo "Running creates on 1 node(s)."
+
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
+                            --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+        echo "+ ${COMMAND}"
+        mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+        if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+            error "mpirun ... mdsrate ... failed, aborting"
+        fi
+        check_rate create ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+    fi
+
+    if [ -n "$NOUNLINK" ]; then
+        echo "NO Test for unlinks for a single client."
+    else
+        log "===== $0 ### 1 NODE UNLINK ###"
+        echo "Running unlinks on 1 node(s)."
+
+        let NUM_FILES=${SINGLE_TARGET_RATE}\*${TIME_PERIOD}
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+                     --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+        echo "+ ${COMMAND}"
+        mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+        if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+            error "mpirun ... mdsrate ... failed, aborting"
+        fi
+        check_rate unlink ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+    fi
+fi
+
+if [ -n "$NOMULTI" ]; then
+    echo "NO tests on multiple nodes."
+else
+    if [ -n "$NOCREATE" ]; then
+        echo "NO test for create on multiple nodes."
+    else
+        do_node $CLIENT rm -rf $TESTDIR_MULTI
+
+        log "===== $0 ### $NUM_CLIENTS NODES CREATE ###"
+        echo "Running creates on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client."
+
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
+                            --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+        echo "+ ${COMMAND}"
+        mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \
+            ${COMMAND} | tee ${LOG}
+        if [ ${PIPESTATUS[0]} != 0 ]; then
+            [ -f $LOG ] && cat $LOG
+            error "mpirun ... mdsrate ... failed, aborting"
+        fi
+        check_rate create ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
+    fi
+
+    if [ -n "$NOUNLINK" ]; then
+        echo "NO Test for unlinks multiple nodes."
+    else
+        log "===== $0 ### $NUM_CLIENTS NODES UNLINK ###"
+        echo "Running unlinks on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client."
+
+        let NUM_FILES=${AGGREGATE_TARGET_RATE}\*${TIME_PERIOD}
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+                      --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+        echo "+ ${COMMAND}"
+        mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \
+            ${COMMAND} | tee ${LOG}
+        if [ ${PIPESTATUS[0]} != 0 ]; then
+            [ -f $LOG ] && cat $LOG
+            error "mpirun ... mdsrate ... failed, aborting"
+        fi
+        check_rate unlink ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
+    fi
+fi
+
+equals_msg `basename $0`: test complete, cleaning up
+rm -f $MACHINEFILE 
+zconf_umount_clients $NODES_TO_USE $MOUNT
+check_and_cleanup_lustre
+#rm -f $LOG
+
+exit 0
diff --git a/lustre/tests/mdsrate-lookup-1dir.sh b/lustre/tests/mdsrate-lookup-1dir.sh

new file mode 100644 (file)

index 0000000..9c46674
--- /dev/null
+++ b/lustre/tests/mdsrate-lookup-1dir.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+#
+# This test was used in a set of CMD3 tests (cmd3-5 test).
+
+# Directory lookup retrieval rate single directory 10 million files
+# 5900 random lookups/sec per client node 62,000 random lookups/sec aggregate
+# 
+# In a dir containing 10 million non-striped files the mdsrate Test Program will
+# perform lookups for 10 minutes. This test can be run from a single node for
+# #1 and from all nodes for #2 aggregate test to measure lookup performance.
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+assert_env CLIENTS MDSRATE SINGLECLIENT MPIRUN
+
+MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+TESTDIR=$MOUNT
+
+# Requirements
+NUM_FILES=${NUM_FILES:-1000000}
+TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
+SINGLE_TARGET_RATE=5900                  # ops/sec
+AGGREGATE_TARGET_RATE=62000              # ops/sec
+
+LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
+CLIENT=$SINGLECLIENT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+
+rm -f $LOG
+
+[ ! -x ${MDSRATE} ] && error "${MDSRATE} not built."
+
+log "===== $0 ====== " 
+
+check_and_setup_lustre
+
+generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
+
+$LFS setstripe $TESTDIR -c 1
+get_stripe $TESTDIR
+
+if [ -n "$NOCREATE" ]; then
+    echo "NOCREATE=$NOCREATE  => no file creation."
+else
+    log "===== $0 Test preparation: creating ${NUM_FILES} files."
+    echo "Test preparation: creating ${NUM_FILES} files."
+
+    MDSCOUNT=1
+    NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+    NUM_THREADS=$((NUM_CLIENTS * MDSCOUNT))
+    if [ $NUM_CLIENTS -gt 50 ]; then
+        NUM_THREADS=$NUM_CLIENTS
+    fi
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir ${TESTDIR}
+                        --nfiles ${NUM_FILES} --filefmt 'f%%d'"
+    echo "+" ${COMMAND}
+    mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1 
+
+    # No lockup if error occurs on file creation, abort.
+    [ ${PIPESTATUS[0]} != 0 ] && error "mpirun ... mdsrate ... file creation failed, aborting"
+fi
+
+COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --lookup --time ${TIME_PERIOD} ${SEED_OPTION}
+        --dir ${TESTDIR} --nfiles ${NUM_FILES} --filefmt 'f%%d'"
+
+# 1
+if [ -n "$NOSINGLE" ]; then
+    echo "NO Test for lookups on a single client."
+else
+    log "===== $0 ### 1 NODE LOOKUPS ###"
+    echo "Running lookups on 1 node(s)."
+    echo "+" ${COMMAND}
+    mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+        error "mpirun ... mdsrate ... failed, aborting"
+    fi
+    check_rate lookup ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+fi
+
+# 2
+if [ -n "$NOMULTI" ]; then
+    echo "NO test for lookups on multiple nodes."
+else
+    log "===== $0 ### ${NUM_CLIENTS} NODES LOOKUPS ###"
+    echo "Running lookups on ${NUM_CLIENTS} node(s)."
+    echo "+" ${COMMAND}
+    mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+        error "mpirun ... mdsrate ... failed, aborting"
+    fi
+    check_rate lookup ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
+fi
+
+equals_msg `basename $0`: test complete, cleaning up
+rm -f $MACHINEFILE
+zconf_umount_clients $NODES_TO_USE $MOUNT
+check_and_cleanup_lustre
+#rm -f $LOG
+
+exit 0
diff --git a/lustre/tests/mdsrate-stat-large.sh b/lustre/tests/mdsrate-stat-large.sh

new file mode 100644 (file)

index 0000000..3620282
--- /dev/null
+++ b/lustre/tests/mdsrate-stat-large.sh
@@ -0,0 +1,117 @@
+#!/bin/sh
+#
+# This test was used in a set of CMD3 tests (cmd3-8 test).
+
+# File attribute retrieval rate for large file creation
+# 3300 ops/sec/OST for single node 28500 ops/sec/OST aggregate
+
+# In a dir containing 10 million striped files, the mdsrate Test Program will
+# perform directory ordered stat's (readdir) for 10 minutes. This test will be
+# run from a single node for #1 and from all nodes for #2 aggregate test to
+# measure stat performance.  
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+assert_env CLIENTS MDSRATE SINGLECLIENT MPIRUN
+
+MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+TESTDIR=$MOUNT
+
+# Requirements
+NUM_FILES=${NUM_FILES:-1000000}
+TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
+SINGLE_TARGET_RATE=$((3300 / OSTCOUNT))      # ops/sec
+AGGREGATE_TARGET_RATE=$((28500 / OSTCOUNT))  # ops/sec
+
+# --random_order (default) -OR- --readdir_order
+DIR_ORDER=${DIR_ORDER:-"--readdir_order"}
+
+LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
+CLIENT=$SINGLECLIENT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+
+rm -f $LOG
+
+[ ! -x ${MDSRATE} ] && error "${MDSRATE} not built."
+
+log "===== $0 ====== " 
+
+check_and_setup_lustre
+
+generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
+
+$LFS setstripe $TESTDIR -c -1
+get_stripe $TESTDIR
+
+if [ -n "$NOCREATE" ]; then
+    echo "NOCREATE=$NOCREATE  => no file creation."
+else
+    log "===== $0 Test preparation: creating ${NUM_FILES} files."
+    echo "Test preparation: creating ${NUM_FILES} files."
+
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --dir ${TESTDIR}
+                        --nfiles ${NUM_FILES} --filefmt 'f%%d'"
+    echo "+" ${COMMAND}
+
+    MDSCOUNT=1
+    NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+    NUM_THREADS=$((NUM_CLIENTS * MDSCOUNT))
+    if [ $NUM_CLIENTS -gt 50 ]; then
+        NUM_THREADS=$NUM_CLIENTS
+    fi
+
+    mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1
+    [ ${PIPESTATUS[0]} != 0 ] && error "mpirun ... mdsrate ... file creation failed, aborting"
+
+fi
+
+COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --stat --time ${TIME_PERIOD}
+        --dir ${TESTDIR} --nfiles ${NUM_FILES} --filefmt 'f%%d'
+        ${DIR_ORDER} ${SEED_OPTION}"
+
+# 1
+if [ -n "$NOSINGLE" ]; then
+    echo "NO Test for stats on a single client."
+else
+    log "===== $0 ### 1 NODE STAT ###"
+    echo "Running stats on 1 node(s)."
+    echo "+" ${COMMAND}
+
+    mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+        error "mpirun ... mdsrate ... failed, aborting"
+    fi
+    check_rate stat ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+fi
+
+# 2
+if [ -n "$NOMULTI" ]; then
+    echo "NO test for stats on multiple nodes."
+else
+    log "===== $0 ### ${NUM_CLIENTS} NODES STAT ###"
+    echo "Running stats on ${NUM_CLIENTS} node(s)."
+    echo "+" ${COMMAND}
+
+    NUM_THREADS=$(get_node_count ${NODES_TO_USE//,/ })
+    mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+        error "mpirun ... mdsrate ... failed, aborting"
+    fi
+    check_rate stat ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
+fi
+
+equals_msg `basename $0`: test complete, cleaning up
+rm -f $MACHINEFILE
+zconf_umount_clients $NODES_TO_USE $MOUNT
+check_and_cleanup_lustre
+#rm -f $LOG
+
+exit 0
diff --git a/lustre/tests/mdsrate-stat-small.sh b/lustre/tests/mdsrate-stat-small.sh

new file mode 100644 (file)

index 0000000..ddcd609
--- /dev/null
+++ b/lustre/tests/mdsrate-stat-small.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+#
+# This test was used in a set of CMD3 tests (cmd3-7 test).
+
+# File attribute retrieval rate for small file creation
+# 3200 ops/sec for single node 29,000 ops/sec aggregate
+
+# In a dir containing 10 million non-striped files, the mdsrate Test Program
+# will perform directory ordered stat's (readdir) for 10 minutes. This test
+# will be run from a single node for #1 and from all nodes for #2
+# aggregate test to measure stat performance.
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+assert_env CLIENTS MDSRATE SINGLECLIENT MPIRUN
+
+MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+TESTDIR=$MOUNT
+
+# Requirements
+NUM_FILES=${NUM_FILES:-1000000}
+TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
+SINGLE_TARGET_RATE=3200                     # ops/sec
+AGGREGATE_TARGET_RATE=29000                 # ops/sec
+
+# --random_order (default) -OR- --readdir_order
+DIR_ORDER=${DIR_ORDER:-"--readdir_order"}
+
+LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
+CLIENT=$SINGLECLIENT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+
+rm -f $LOG
+
+[ ! -x ${MDSRATE} ] && error "${MDSRATE} not built."
+
+log "===== $0 ====== " 
+
+check_and_setup_lustre
+
+generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
+
+$LFS setstripe $TESTDIR -i 0 -c 1
+get_stripe $TESTDIR
+
+if [ -n "$NOCREATE" ]; then
+    echo "NOCREATE=$NOCREATE  => no file creation."
+else
+    log "===== $0 Test preparation: creating ${NUM_FILES} files."
+    echo "Test preparation: creating ${NUM_FILES} files."
+
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir ${TESTDIR}
+                        --nfiles ${NUM_FILES} --filefmt 'f%%d'"
+    echo "+" ${COMMAND}
+
+    MDSCOUNT=1
+    NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+    NUM_THREADS=$((NUM_CLIENTS * MDSCOUNT))
+    if [ $NUM_CLIENTS -gt 50 ]; then
+        NUM_THREADS=$NUM_CLIENTS
+    fi
+
+    mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1
+    [ ${PIPESTATUS[0]} != 0 ] && error "Error running mdsrate, aborting..."
+
+fi
+
+COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --stat --time ${TIME_PERIOD}
+        --dir ${TESTDIR} --nfiles ${NUM_FILES} --filefmt 'f%%d'
+        ${DIR_ORDER} ${SEED_OPTION}"
+
+# 1
+if [ -n "$NOSINGLE" ]; then
+    echo "NO Test for stats on a single client."
+else
+    log "===== $0 ### 1 NODE STAT ###"
+    echo "Running stats on 1 node(s)."
+    echo "+" ${COMMAND}
+
+    mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+    
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+        error "mpirun ... mdsrate ... failed, aborting"
+    fi
+    check_rate stat ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+fi
+
+# 2
+if [ -n "$NOMULTI" ]; then
+    echo "NO test for stats on multiple nodes."
+else
+    log "===== $0 ### ${NUM_CLIENTS} NODES STAT ###"
+    echo "Running stats on ${NUM_CLIENTS} node(s)."
+    echo "+" ${COMMAND}
+
+    mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && cat $LOG
+        error "mpirun ... mdsrate ... failed, aborting"
+    fi
+    check_rate stat ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
+fi
+
+equals_msg `basename $0`: test complete, cleaning up
+rm -f $MACHINEFILE
+zconf_umount_clients $NODES_TO_USE $MOUNT
+check_and_cleanup_lustre
+#rm -f $LOG
+
+exit 0
diff --git a/lustre/tests/mdsrate.c b/lustre/tests/mdsrate.c

new file mode 100644 (file)

index 0000000..6a92a24
--- /dev/null
+++ b/lustre/tests/mdsrate.c
@@ -0,0 +1,780 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * 2003, Copyright, Hewlett-Packard Development Compnay, LP.
+ *
+ * Developed under the sponsorship of the U.S. Government
+ *     under Subcontract No. B514193
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <limits.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <dirent.h>
+
+#include "mpi.h"
+
+/* lustre */
+#include <lustre/liblustreapi.h>        /* for O_LOV_DELAY_CREATE */
+
+#define CHECK_COUNT 10000
+#define DISPLAY_COUNT (CHECK_COUNT * 10)
+#define DISPLAY_TIME 100
+
+enum {
+        CREATE   = 'c',
+        LOOKUP   = 'l',
+        MKNOD    = 'm',
+        OPEN     = 'o',
+        STAT     = 's',
+        UNLINK   = 'u',
+        BEGIN    = 'b',
+        ITERS    = 'i',
+        TIME     = 't',
+        DIRFMT   = 'd',
+        NDIRS    = 'D',
+        FILEFMT  = 'f',
+        NFILES   = 'F',
+        NOEXCL   = 'X',
+        STRIPES  = 'S',
+        SEED     = 'r',
+        SEEDFILE = 'R',
+        RANDOM   = 'A',
+        READDIR  = 'B',
+        RECREATE = 'C',
+        VERBOSE  = 'V',
+        DEBUG    = 'v',
+        HELP     = 'h',
+};
+
+struct option longOpts[] = {
+        {"create",        0, NULL, CREATE     },
+        {"lookup",        0, NULL, LOOKUP     },
+        {"mknod",         0, NULL, MKNOD      },
+        {"open",          0, NULL, OPEN       },
+        {"stat",          0, NULL, STAT       },
+        {"unlink",        0, NULL, UNLINK     },
+        {"begin",         1, NULL, BEGIN      },
+        {"iters",         1, NULL, ITERS      },
+        {"time",          1, NULL, TIME       },   /* seconds */
+        {"dirfmt",        1, NULL, DIRFMT     },
+        {"ndirs",         1, NULL, NDIRS      },
+        {"filefmt",       1, NULL, FILEFMT    },
+        {"nfiles",        1, NULL, NFILES     },
+        {"noexcl",        0, NULL, NOEXCL     },
+        {"stripes",       1, NULL, STRIPES    },
+        {"seed",          1, NULL, SEED       },
+        {"seedfile",      1, NULL, SEEDFILE   },
+        {"random_order",  0, NULL, RANDOM     },
+        {"readdir_order", 0, NULL, READDIR    },
+        {"recreate",      0, NULL, RECREATE   },
+        {"verbose",       0, NULL, VERBOSE    },
+        {"debug",         0, NULL, DEBUG      },
+        {"help",          0, NULL, HELP       },
+        { 0,              0, NULL, 0          }
+};
+
+int foo1, foo2;
+
+char   shortOpts[128];
+int    myrank = -1;
+int    nthreads = -1;
+char * prog;
+char   hostname[512] = "unknown";
+char   mode;
+char * cmd;
+int    openflags = O_RDWR|O_CREAT|O_EXCL;
+int    ndirs = 1;
+char * dirfmt;
+char   dir[PATH_MAX];
+char   mkdir_cmd[PATH_MAX+14];
+int    dirthreads;
+int    dirnum;
+DIR *  directory;
+struct dirent *dir_entry;
+int    nfiles;
+char   filefmt[PATH_MAX];
+char   filename[PATH_MAX];
+int    stripes = -1;
+int    begin;
+int    beginsave;
+int    end;
+int    iters;
+int    seconds;
+int    alarm_caught;
+struct sigaction act;
+int    order = RANDOM;
+int    seed;
+int    recreate;
+int    verbose;
+int    debug;
+struct stat statbuf;
+
+#define dmesg if (debug) printf
+
+#define DISPLAY_PROGRESS() {                                                \
+        if ((++nops % CHECK_COUNT) == 0 && verbose) {                       \
+                curTime = time(0);                                          \
+                interval = curTime - lastTime;                              \
+                if (interval > DISPLAY_TIME || nops % DISPLAY_COUNT == 0) { \
+                        rate = (float)(nops - lastOps);                     \
+                        if (interval > 1)                                   \
+                                rate /= (float)interval;                    \
+                        printf("Rank %d: %.2f %ss/sec %lu secs "            \
+                               "(total: %d %ss %lu secs)\n",                \
+                               myrank, rate, cmd, interval,                 \
+                               nops, cmd, curTime - startTime);             \
+                        lastOps = nops;                                     \
+                        lastTime = curTime;                                 \
+                }                                                           \
+        }                                                                   \
+}
+
+char *usage_msg = "usage: %s\n"
+                  "    { --create [ --noexcl ] | --lookup | --mknod |\n"
+                  "      --open | --stat | --unlink  [ --recreate ] }\n"
+                  "    [ --help ] [ --verbose ] [ --debug ]\n"
+                  "    { [ --begin <num> ] --nfiles <num> }\n"
+                  "    [ --iters <num> ] [ --time <secs> ]\n"
+                  "    [ --dirfmt <str> ] [ --ndirs  <num> ]\n"
+                  "    [ --filefmt <str> ] [ --stripes <num> ]\n"
+                  "    [ --random_order [--seed <num> | --seedfile <file>] ]\n"
+                  "    [ --readdir_order ]\n";
+
+static void
+usage(FILE *stream, char *fmt, ...)
+{
+        if (myrank == 0) {
+                if (fmt != NULL) {
+                        va_list       ap;
+
+                        fprintf(stream, "%s: ", prog);
+                        va_start(ap, fmt);
+                        vfprintf(stderr, fmt, ap);
+                        va_end(ap);
+                }
+                fprintf(stream, usage_msg, prog);
+        }
+
+        MPI_Finalize();
+        exit(stream == stderr);
+}
+
+/* Print process myrank and message, and exit (i.e. a fatal error) */
+static int
+fatal(int rank, const char *fmt, ...)
+{
+        if (rank == myrank) {
+                va_list       ap;
+
+                fprintf(stderr, "rank %d: ", rank);
+                va_start(ap, fmt);
+                vfprintf(stderr, fmt, ap);
+                va_end(ap);
+        }
+
+        MPI_Abort(MPI_COMM_WORLD, 1);
+        exit(1);
+}
+
+static void
+sigalrm_handler(int signum)
+{
+        alarm_caught++;
+}
+
+/* HAVE_LLAPI_FILE_LOOKUP is defined by liblustreapi.h if this function is
+ * defined therein.  Otherwise we can do the equivalent operation via ioctl
+ * if we have access to a complete lustre build tree to get the various
+ * definitions - then compile with USE_MDC_LOOKUP defined. */
+#if defined(HAVE_LLAPI_FILE_LOOKUP)
+#define HAVE_MDC_LOOKUP
+#elif defined(USE_MDC_LOOKUP)
+#include <config.h>
+#include <liblustre.h>
+#include <linux/lustre_lib.h>
+
+int llapi_file_lookup(int dirfd, const char *name)
+{
+        struct obd_ioctl_data data = { 0 };
+        char rawbuf[8192];
+        char *buf = rawbuf;
+        int rc;
+
+        if (dirfd < 0 || name == NULL)
+                return -EINVAL;
+
+        data.ioc_version = OBD_IOCTL_VERSION;
+        data.ioc_len = sizeof(data);
+        data.ioc_inlbuf1 = name;
+        data.ioc_inllen1 = strlen(name) + 1;
+
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fatal(myrank, "ioctl_pack failed: rc = %d\n", rc);
+                return rc;
+        }
+
+        return ioctl(fd, IOC_MDC_LOOKUP, buf);
+}
+#define HAVE_MDC_LOOKUP
+#endif
+
+static void
+process_args(int argc, char *argv[])
+{
+        char   c, *cp, *endptr;
+        int    i, index, offset, tmpend, rc;
+        char   tmp[16];
+        FILE * seed_file;
+        struct option *opt;
+
+        setbuf(stdout, 0);
+        setbuf(stderr, 0);
+        prog = basename(argv[0]);
+        strcpy(filefmt, "f%d");
+        gethostname(hostname, sizeof(hostname));
+
+        /* auto create shortOpts rather than maintaining a static string. */
+        for (opt = longOpts, cp = shortOpts; opt->name != NULL; opt++, cp++) {
+                *cp = opt->val;
+                if (opt->has_arg)
+                        *++cp = ':';
+        }
+
+        while ((c = getopt_long(argc,argv, shortOpts, longOpts,&index)) != -1) {
+                switch (c) {
+                case OPEN:
+                        openflags &= ~(O_CREAT|O_EXCL);
+                case CREATE:
+#ifdef HAVE_MDC_LOOKUP
+                case LOOKUP:
+#endif
+                case MKNOD:
+                case STAT:
+                case UNLINK:
+                        if (cmd != NULL) {
+                                fatal(0, "Invalid - more than one operation "
+                                           "specified: --%s\n",
+                                        longOpts[index].name);
+                        }
+                        mode = c;
+                        cmd = (char *)longOpts[index].name;
+                        break;
+                case NOEXCL:
+                        if (mode != CREATE && mode != MKNOD) {
+                                usage(stderr, "--noexcl only applies to "
+                                              "--create or --mknod.\n");
+                        }
+                        openflags &= ~O_EXCL;
+                        break;
+                case RECREATE:
+                        if (mode != UNLINK) {
+                                usage(stderr, "--recreate only makes sense"
+                                              "with --unlink.\n");
+                        }
+                        recreate++;
+                        break;
+                case BEGIN:
+                        begin = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (begin < 0)) {
+                                fatal(0, "Invalid --start value.\n");
+                        }
+                        break;
+                case ITERS:
+                        iters = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (iters <= 0)) {
+                                fatal(0, "Invalid --iters value.\n");
+                        }
+                        if (mode != LOOKUP && mode != OPEN && mode != STAT) {
+                                usage(stderr, "--iters only makes sense with "
+                                              "--lookup, --open, or --stat.\n");
+                        }
+                        break;
+                case TIME:
+                        seconds = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (seconds <= 0)) {
+                                fatal(0, "Invalid --time value.\n");
+                        }
+                        break;
+                case DIRFMT:
+                        if (strlen(optarg) > (PATH_MAX - 16)) {
+                                fatal(0, "--dirfmt too long\n");
+                        }
+                        dirfmt = optarg;
+                        break;
+                case NDIRS:
+                        ndirs = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (ndirs <= 0)) {
+                                fatal(0, "Invalid --ndirs value.\n");
+                        }
+                        if ((ndirs > nthreads) &&
+                            ((mode == CREATE) || (mode == MKNOD))) {
+                                fatal(0, "--ndirs=%d must be less than or "
+                                      "equal to the number of threads (%d).\n",
+                                      ndirs, nthreads);
+                        }
+                        break;
+                case FILEFMT:
+                        if (strlen(optarg) > 4080) {
+                                fatal(0, "--filefmt too long\n");
+                        }
+
+                        /* Use %%d where you want the file # in the name. */
+                        sprintf(filefmt, optarg, myrank);
+                        break;
+                case NFILES:
+                        nfiles = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (nfiles <= 0)) {
+                                fatal(0, "Invalid --nfiles value.\n");
+                        }
+                        break;
+                case STRIPES:
+                        stripes = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (stripes < 0)) {
+                                fatal(0, "Invalid --stripes value.\n");
+                        }
+
+                        if (stripes == 0) {
+                                openflags |= O_LOV_DELAY_CREATE;
+                        } else {
+                                fatal(0, "non-zero --stripes value "
+                                         "not yet supported.\n");
+                        }
+
+                        break;
+                case SEED:
+                        seed = strtoul(optarg, &endptr, 0);
+                        if (*endptr) {
+                                fatal(0, "bad --seed option %s\n", optarg);
+                        }
+                        break;
+                case SEEDFILE:
+                        seed_file = fopen(optarg, "r");
+                        if (!seed_file) {
+                              fatal(myrank, "fopen(%s) error: %s\n",
+                                      optarg, strerror(errno));
+                        }
+
+                        for (i = -1; fgets(tmp, 16, seed_file) != NULL;) {
+                                if (++i == myrank)
+                                        break;
+                        }
+
+                        if (i == myrank) {
+                                rc = sscanf(tmp, "%d", &seed);
+                                if ((rc != 1) || (seed < 0)) {
+                                        fatal(myrank, "Invalid seed value '%s' "
+                                              "at line %d in %s.\n",
+                                              tmp, i, optarg);
+                                }
+                        } else {
+                                fatal(myrank, "File '%s' too short. Does not "
+                                      "contain a seed for thread %d.\n",
+                                      optarg, myrank);
+                        }
+
+                        fclose(seed_file);
+                        break;
+                case RANDOM:
+                case READDIR:
+                        if (mode != LOOKUP && mode != OPEN && mode != STAT)  {
+                                fatal(0, "--%s can only be specified with "
+                                         "--lookup, --open, or --stat.\n",
+                                      (char *)longOpts[index].name);
+                        }
+                        order = c;
+                        break;
+                case DEBUG:
+                        ++debug;
+                case VERBOSE:
+                        ++verbose;
+                        break;
+                case HELP:
+                        usage(stdout, NULL);
+                default:
+                        usage(stderr, "unrecognized option: '%c'.\n", optopt);
+                }
+        }
+
+        if (optind < argc) {
+                usage(stderr, "too many arguments %d >= %d.\n", optind, argc);
+        }
+
+        if (mode == CREATE || mode == MKNOD || mode == UNLINK) {
+                if (seconds != 0) {
+                        if (nfiles == 0)
+                                nfiles = INT_MAX;
+                } else if (nfiles == 0) {
+                        usage(stderr, "--nfiles or --time must be specified "
+                                      "with %s.\n", cmd);
+                }
+        } else if (mode == LOOKUP || mode == OPEN || mode == STAT) {
+                if (seconds != 0) {
+                        if (iters == 0)
+                                iters = INT_MAX;
+                } else if (iters == 0) {
+                        usage(stderr, "--iters or --time must be specifed "
+                                      "with %s.\n", cmd);
+                }
+
+                if (nfiles == 0) {
+                        usage(stderr, "--nfiles must be specifed with --%s.\n",
+                              cmd);
+                }
+
+                if (seed == 0) {
+                        int fd = open("/dev/urandom", O_RDONLY);
+
+                        if (fd >= 0) {
+                                if (read(fd, &seed, sizeof(seed)) <
+                                    sizeof(seed))
+                                        seed = time(0);
+                                close(fd);
+                        } else {
+                                seed = time(0);
+                        }
+                }
+
+                srand(seed);
+
+                dmesg("%s: rank %d seed %d (%s).\n", prog, myrank, seed,
+                      (order == RANDOM) ? "random_order" : "readdir_order");
+        } else {
+                usage(stderr, "one --create, --mknod, --open, --stat,"
+#ifdef HAVE_MDC_LOOKUP
+                      " --lookup,"
+#endif
+                      " or --unlink must be specifed.");
+        }
+
+        /* support for multiple threads in a dir, set begin/end appropriately.*/
+        dirnum = myrank % ndirs;
+        dirthreads = nthreads / ndirs;
+        if (nthreads > (ndirs * dirthreads + dirnum))
+                ++dirthreads;
+
+        offset = myrank / ndirs;
+
+        tmpend = begin + nfiles - 1;
+        if (tmpend <= 0)
+                tmpend = INT_MAX;
+
+        end = begin + (nfiles / dirthreads) * dirthreads + offset;
+        if ((end > tmpend) || (end <= 0))
+                end -= dirthreads;
+
+        begin += offset;
+        if (begin < 0)
+                begin = INT_MAX;
+
+       beginsave = begin;
+
+        dmesg("%d: iters %d nfiles %d time %d begin %d end %d dirthreads %d."
+              "\n", myrank, iters, nfiles, seconds, begin, end, dirthreads);
+
+        if (dirfmt == NULL) {
+                strcpy(dir, ".");
+        } else {
+                sprintf(dir, dirfmt, dirnum);
+
+                sprintf(mkdir_cmd, "/bin/mkdir -p %s", dir);
+                #ifdef _LIGHTWEIGHT_KERNEL
+                        printf("NOTICE: not running system(%s)\n", mkdir_cmd);
+                #else
+                        rc = system(mkdir_cmd);
+                        if (rc) {
+                                fatal(myrank, "'%s' failed.\n", mkdir_cmd);
+                        }
+                #endif
+
+                rc = chdir(dir);
+                if (rc) {
+                        fatal(myrank, "unable to chdir to '%s'.\n", dir);
+                }
+        }
+}
+
+static inline char *next_file()
+{
+        if (order == RANDOM) {
+                sprintf(filename, filefmt, random() % nfiles);
+                return(filename);
+        }
+
+        /* readdir order */
+
+        dir_entry = readdir(directory);
+        if (dir_entry == NULL) {
+                rewinddir(directory);
+                while ((dir_entry = readdir(directory)) != NULL) {
+                        if (dir_entry->d_name[0] != '.')
+                                return(dir_entry->d_name);
+                }
+
+                fatal(myrank, "unable to read directory %s (%s).\n",
+                      dir, strerror(errno));
+        }
+
+        return(dir_entry->d_name);
+}
+
+int
+main(int argc, char *argv[])
+{
+        int    i, j, fd, rc, nops, lastOps, ag_ops;
+        float  rate, ag_rate;
+        time_t startTime, lastTime, curTime, interval;
+        char * file;
+
+        rc = MPI_Init(&argc, &argv);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "MPI_Init failed: %d\n", rc);
+
+        rc = MPI_Comm_size(MPI_COMM_WORLD, &nthreads);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "MPI_Comm_size failed: %d\n", rc);
+
+        rc = MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "MPI_Comm_rank failed: %d\n", rc);
+
+        process_args(argc, argv);
+
+        startTime = time(0);
+        if ((myrank == 0) || debug) {
+               printf("%d: %s starting at %s",
+                      myrank, hostname, ctime(&startTime));
+       }
+
+        /* if we're not measuring creation rates then precreate
+         * the files we're operating on. */
+        if ((mode != CREATE) && (mode != MKNOD)) {
+                /* create the files in reverse order. When we encounter
+                 * a file that already exists, assume the remainder of 
+                 * the files exist to save time. The timed performance
+                 * test scripts make use of this behavior. */
+                for (i = end, j = 0; i >= begin; i -= dirthreads) {
+                        sprintf(filename, filefmt, i);
+                        fd = open(filename, openflags, 0644);
+                        if (fd < 0) {
+                                if (errno == EEXIST)
+                                        break;
+                                rc = errno;
+                                fatal(myrank, "precreate open(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+                        j++;
+                        close(fd);
+                }
+                dmesg("%d: %s pre-created %d files.\n",myrank,hostname,j);
+
+                rc = MPI_Barrier(MPI_COMM_WORLD);
+                if (rc != MPI_SUCCESS)
+                        fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
+        }
+
+        if (order == READDIR) {
+                directory = opendir(dir);
+                if (directory == NULL) {
+                        rc = errno;
+                        fatal(myrank, "opendir(%s) error: %s\n",
+                              dir, strerror(rc));
+                }
+
+                startTime = time(0);
+                j = random() % nfiles;
+                dmesg("%d: %s initializing dir offset %u: %s",
+                      myrank, hostname, j, ctime(&startTime));
+
+                for (i = 0; i <= j; i++) {
+                        if ((dir_entry = readdir(directory)) == NULL) {
+                                fatal(myrank, "could not read entry number %d "
+                                      "in directory %s.\n", i, dir);
+                        }
+                }
+
+                lastTime = time(0);
+                dmesg("%d: index %d, filename %s, offset %ld: "
+                      "%s initialization complete: %s",
+                      myrank, i, dir_entry->d_name, telldir(directory),
+                      hostname, ctime(&lastTime));
+        }
+
+        rc = MPI_Barrier(MPI_COMM_WORLD);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
+
+        if (seconds) {
+                act.sa_handler = sigalrm_handler;
+                (void)sigemptyset(&act.sa_mask);
+                act.sa_flags = 0;
+                sigaction(SIGALRM, &act, NULL);
+                alarm(seconds);
+        }
+
+        startTime = lastTime = time(0);
+        nops = lastOps = 0;
+
+        switch (mode) {
+        case CREATE:
+                for (; begin <= end && !alarm_caught; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        if ((fd = open(filename, openflags, 0644)) < 0) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "open(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        close(fd);
+                        DISPLAY_PROGRESS();
+                }
+
+                dmesg("%d: created %d files, last file '%s'.\n",
+                      myrank, nops, filename);
+                break;
+#ifdef HAVE_MDC_LOOKUP
+        case LOOKUP:
+                fd = open(dir, O_RDONLY);
+                if (fd < 0) {
+                        fatal(myrank, "open(dir == '%s') error: %s\n",
+                              dir, strerror(errno));
+                }
+
+                for (; nops < iters && !alarm_caught;) {
+                        char *filename = next_file();
+                        rc = llapi_file_lookup(fd, filename);
+                        if (rc < 0) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "llapi_file_lookup(%s) "
+                                      "error: %s\n", filename, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+#endif
+        case MKNOD:
+                for (; begin <= end && !alarm_caught; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        rc = mknod(filename, S_IFREG| 0644, 0);
+                        if (rc) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "mknod(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        case OPEN:
+                for (; nops < iters && !alarm_caught;) {
+                        file = next_file();
+                        if ((fd = open(file, openflags, 0644)) < 0) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "open(%s) error: %s\n",
+                                      file, strerror(rc));
+                        }
+
+                        close(fd);
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        case STAT:
+                for (; nops < iters && !alarm_caught;) {
+                        rc = stat(file = next_file(), &statbuf);
+                        if (rc) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "stat(%s) error: %s\n",
+                                      file, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        case UNLINK:
+                for (; begin <= end && !alarm_caught; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        rc = unlink(filename);
+                        if (rc) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "unlink(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        }
+
+        curTime = time(0);
+        interval = curTime - startTime;
+        rate = (float)(nops);
+        if (interval != 0)
+                rate /= (float)interval;
+
+        rc = MPI_Reduce(&nops, &ag_ops, 1, MPI_INT, MPI_SUM, 0,
+                        MPI_COMM_WORLD);
+        if (rc != MPI_SUCCESS) {
+                fatal(myrank, "Failure in MPI_Reduce of total ops.\n");
+        }
+
+        rc = MPI_Reduce(&rate, &ag_rate, 1, MPI_FLOAT, MPI_SUM, 0,
+                        MPI_COMM_WORLD);
+        if (rc != MPI_SUCCESS) {
+                fatal(myrank, "Failure in MPI_Reduce of aggregated rate.\n");
+        }
+
+        if (myrank == 0) {
+                printf("Rate: %.2f %ss/sec (total: %d threads %d %ss %lu secs)"
+                       "\n", ag_rate, cmd, nthreads, ag_ops, cmd, interval);
+        }
+
+        if (recreate) {
+                for (begin = beginsave; begin <= end; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        if ((fd = open(filename, openflags, 0644)) < 0) {
+                                rc = errno;
+                               if (rc == EEXIST)
+                                       break;
+                                fatal(myrank, "recreate open(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        close(fd);
+                }
+        }
+
+        curTime = time(0);
+        if ((myrank == 0) || debug) {
+               printf("%d: %s finished at %s",
+                      myrank, hostname, ctime(&curTime));
+       }
+
+        MPI_Finalize();
+        return(0);
+}
diff --git a/lustre/tests/memhog.c b/lustre/tests/memhog.c

index 11cb734..1d98e88 100644 (file)
--- a/lustre/tests/memhog.c
+++ b/lustre/tests/memhog.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <sys/types.h>
  #include <unistd.h>
  #include <stdio.h>
diff --git a/lustre/tests/mkdirdeep.c b/lustre/tests/mkdirdeep.c

index d5f1b27..853e1e7 100644 (file)
--- a/lustre/tests/mkdirdeep.c
+++ b/lustre/tests/mkdirdeep.c
@@ -1,6 +1,40 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/mkdirdeep.c
+ *
   * Compile with:
   * cc -I../../lnet/include -o mkdirdeep mkdirdeep.c
   *    -L../../lnet/linux/utils -lptlctl
diff --git a/lustre/tests/mkdirmany.c b/lustre/tests/mkdirmany.c

index 0f7a1b6..a4ea865 100755 (executable)
--- a/lustre/tests/mkdirmany.c
+++ b/lustre/tests/mkdirmany.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/mlink.c b/lustre/tests/mlink.c

index 5688b9f..5ef92e9 100755 (executable)
--- a/lustre/tests/mlink.c
+++ b/lustre/tests/mlink.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/mmap_sanity.c b/lustre/tests/mmap_sanity.c

index f4d0595..ea0db2d 100644 (file)
--- a/lustre/tests/mmap_sanity.c
+++ b/lustre/tests/mmap_sanity.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <unistd.h>
  #include <stdlib.h>
diff --git a/lustre/tests/mrename.c b/lustre/tests/mrename.c

index 1c18880..374a346 100644 (file)
--- a/lustre/tests/mrename.c
+++ b/lustre/tests/mrename.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdlib.h>
  #include <stdio.h>
  #include <errno.h>
diff --git a/lustre/tests/multifstat.c b/lustre/tests/multifstat.c

index b3d6479..e169073 100644 (file)
--- a/lustre/tests/multifstat.c
+++ b/lustre/tests/multifstat.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/multiop.c b/lustre/tests/multiop.c

index 3bd2128..946fe05 100755 (executable)
--- a/lustre/tests/multiop.c
+++ b/lustre/tests/multiop.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE /* pull in O_DIRECTORY in bits/fcntl.h */
  #endif
@@ -161,7 +194,7 @@ int main(int argc, char **argv)
                  case '_':
                          if (usr1_received == 0) {
                                  if (verbose) {
-                                        printf("PAUSING\n");
+                                        printf("PAUSING %u\n", getpid());
                                          fflush(stdout);
                                  }
                                  pause();
diff --git a/lustre/tests/munlink.c b/lustre/tests/munlink.c

index 2390cd9..4468483 100755 (executable)
--- a/lustre/tests/munlink.c
+++ b/lustre/tests/munlink.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/o_directory.c b/lustre/tests/o_directory.c

index b587cd0..cb7f913 100644 (file)
--- a/lustre/tests/o_directory.c
+++ b/lustre/tests/o_directory.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  /* for O_DIRECTORY */
diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh

index 541a860..77f360c 100755 (executable)
--- a/lustre/tests/oos.sh
+++ b/lustre/tests/oos.sh
@@ -3,13 +3,17 @@
  set -e
  #set -vx
  
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+
  export PATH=`dirname $0`/../utils:$PATH
  LFS=${LFS:-lfs}
+LCTL=${LCTL:-lctl}
  MOUNT=${MOUNT:-$1}
  MOUNT=${MOUNT:-/mnt/lustre}
  OOS=$MOUNT/oosfile
  TMP=${TMP:-/tmp}
-LOG=$TMP/ooslog
+LOG=$TMP/$(basename $0 .sh).log
  
  SUCCESS=1
  
@@ -17,24 +21,32 @@ rm -f $OOS $LOG
  
  sync; sleep 1; sync    # to ensure we get up-to-date statfs info
  
-#echo -1 > /proc/sys/lnet/debug
-#echo 0x40a8 > /proc/sys/lnet/subsystem_debug
-#lctl clear
-#lctl debug_daemon start /r/tmp/debug 1024
+#$LCTL set_param -n debug=-1
+#$LCTL set_param -n subsystem_debug=0x40a8
+
+#$LCTL clear
+#$LCTL debug_daemon start /r/tmp/debug 1024
  
-STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1`
-ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1`
+STRIPECOUNT=`$LCTL get_param -n lov.*.activeobd | head -n 1`
+ORIGFREE=`$LCTL get_param -n llite.*.kbytesavail | head -n 1`
  MAXFREE=${MAXFREE:-$((400000 * $STRIPECOUNT))}
+echo STRIPECOUNT=$STRIPECOUNT ORIGFREE=$ORIGFREE MAXFREE=$MAXFREE
  if [ $ORIGFREE -gt $MAXFREE ]; then
-       echo "skipping out-of-space test on $OSC"
-       echo "reports ${ORIGFREE}kB free, more than MAXFREE ${MAXFREE}kB"
-       echo "increase $MAXFREE (or reduce test fs size) to proceed"
+       skip "$0: ${ORIGFREE}kB free gt MAXFREE ${MAXFREE}kB, increase $MAXFREE (or reduce test fs size) to proceed"
         exit 0
  fi
  
  export LANG=C LC_LANG=C # for "No space left on device" message
  
-[ -f $LOG ] && echo "ERROR: log file wasn't removed?" && exit 1
+[ -f $LOG ] && error "log file wasn't removed?"
+
+echo BEFORE dd started
+for OSC in `$LCTL get_param -N osc.*-osc-*.kbytesavail | cut -d"." -f1-2`; do
+       AVAIL=`$LCTL get_param -n $OSC.kbytesavail`
+       GRANT=$((`$LCTL get_param -n $OSC.cur_grant_bytes` / 1024))
+       echo -n "$(echo $OSC | cut -d"." -f2) avl=$AVAIL grnt=$GRANT diff=$(($AVAIL - $GRANT))"
+       echo " "
+done
  
  # make sure we stripe over all OSTs to avoid OOS on only a subset of OSTs
  $LFS setstripe $OOS -c $STRIPECOUNT
@@ -43,6 +55,8 @@ if dd if=/dev/zero of=$OOS count=$(($ORIGFREE + 100)) bs=1k 2> $LOG; then
         SUCCESS=0
  fi
  
+[ ! -s "$LOG" ] && error "LOG file is empty!"
+
  if [ "`grep -c 'No space left on device' $LOG`" -ne 1 ]; then
         echo "ERROR: dd not return ENOSPC"
         sed "s/^/LOG: /" $LOG
@@ -52,17 +66,19 @@ fi
  # flush cache to OST(s) so avail numbers are correct
  sync; sleep 1 ; sync
  
-for OSC in /proc/fs/lustre/osc/*-osc-*; do
-       AVAIL=`cat $OSC/kbytesavail`
-       GRANT=$((`cat $OSC/cur_grant_bytes` / 1024))
-       echo -n "$(basename $OSC) avl=$AVAIL grnt=$GRANT diff=$(($AVAIL - $GRANT))"
+echo AFTER dd
+for OSC in `$LCTL get_param -N osc.*-osc-*.kbytesavail | cut -d"." -f1-2`; do
+       AVAIL=`$LCTL get_param -n $OSC.kbytesavail`
+       GRANT=$((`$LCTL get_param -n $OSC.cur_grant_bytes` / 1024))
+       echo -n "$(echo $OSC | cut -d"." -f2) avl=$AVAIL grnt=$GRANT diff=$(($AVAIL - $GRANT))"
         [ $(($AVAIL - $GRANT)) -lt 400 ] && OSCFULL=full && echo -n " FULL"
         echo " "
  done
  
  if [ -z "$OSCFULL" ]; then
         echo "no OSTs are close to full"
-       grep "[0-9]" /proc/fs/lustre/osc/*-osc-*/{kbytesavail,cur*}
+       $LCTL get_param "osc.*-osc-*.kbytesavail"
+       $LCTL get_param "osc.*-osc-*.cur*"
         SUCCESS=0
  fi
  
@@ -77,8 +93,10 @@ elif [ "$RECORDSOUT" -ne $((FILESIZE / 1024)) ]; then
         SUCCESS=0
  fi
  
-#lctl debug_daemon stop
+#$LCTL debug_daemon stop
  
+echo LOG file
+cat $LOG
  rm -f $OOS
  sync; sleep 1; sync
  
diff --git a/lustre/tests/oos2.sh b/lustre/tests/oos2.sh

index d3a4050..6437fa4 100644 (file)
--- a/lustre/tests/oos2.sh
+++ b/lustre/tests/oos2.sh
@@ -2,8 +2,12 @@
  
  set -e
  
-export PATH=`dirname $0`/../utils:$PATH
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+
+export PATH=$LUSTRE/utils:$PATH
  LFS=${LFS:-lfs}
+LCTL=${LCTL:-lctl}
  MOUNT=${MOUNT:-$1}
  MOUNT=${MOUNT:-/mnt/lustre}
  MOUNT2=${MOUNT2:-$2}
@@ -11,7 +15,7 @@ MOUNT2=${MOUNT2:-${MOUNT}2}
  OOS=$MOUNT/oosfile
  OOS2=$MOUNT2/oosfile2
  TMP=${TMP:-/tmp}
-LOG=$TMP/oosfile
+LOG=$TMP/$(basename $0 .sh).log
  LOG2=${LOG}2
  
  SUCCESS=1
@@ -20,13 +24,12 @@ rm -f $OOS $OOS2 $LOG $LOG2
  
  sync; sleep 1; sync    # to ensure we get up-to-date statfs info
  
-STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1`
-ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1`
+STRIPECOUNT=`$LCTL get_param -n lov.*.activeobd | head -n 1`
+ORIGFREE=`$LCTL get_param -n llite.*.kbytesavail | head -n 1`
  MAXFREE=${MAXFREE:-$((400000 * $STRIPECOUNT))}
+echo STRIPECOUNT=$STRIPECOUNT ORIGFREE=$ORIGFREE MAXFREE=$MAXFREE
  if [ $ORIGFREE -gt $MAXFREE ]; then
-       echo "skipping out-of-space test on $OSC"
-       echo "reports ${ORIGFREE}kB free, more tham MAXFREE ${MAXFREE}kB"
-       echo "increase $MAXFREE (or reduce test fs size) to proceed"
+       skip "$0: ${ORIGFREE}kB free gt MAXFREE ${MAXFREE}kB, increase $MAXFREE (or reduce test fs size) to proceed"
         exit 0
  fi
  
@@ -46,18 +49,23 @@ if wait $DDPID; then
         SUCCESS=0
  fi
  
+[ ! -s "$LOG" ] && error "LOG file is empty!"
+[ ! -s "$LOG2" ] && error "LOG2 file is empty!"
+
  if [ "`cat $LOG $LOG2 | grep -c 'No space left on device'`" -ne 2 ]; then
-        echo "ERROR: dd not return ENOSPC"
+       echo "ERROR: dd not return ENOSPC"
         SUCCESS=0
  fi
  
  # flush cache to OST(s) so avail numbers are correct
  sync; sleep 1 ; sync
  
-for OSC in /proc/fs/lustre/osc/*-osc-*; do
-       AVAIL=`cat $OSC/kbytesavail`
-       GRANT=`cat $OSC/cur_grant_bytes`
-       [ $(($AVAIL - $GRANT / 1024)) -lt 400 ] && OSCFULL=full
+for OSC in `$LCTL get_param -N osc.*-osc-*.kbytesavail | cut -d"." -f1-2`; do
+       AVAIL=`$LCTL get_param -n $OSC.kbytesavail`
+       GRANT=$((`$LCTL get_param -n $OSC.cur_grant_bytes` / 1024))
+       echo -n "$(echo $OSC | cut -d"." -f2) avl=$AVAIL grnt=$GRANT diff=$(($AVAIL - $GRANT))"
+       [ $(($AVAIL - $GRANT)) -lt 400 ] && OSCFULL=full && echo -n " FULL"
+       echo " "
  done
  
  # FIXME - This test reports false failures
@@ -69,7 +77,8 @@ done
  
  if [ -z "$OSCFULL" ]; then
         echo "no OSTs are close to full"
-       grep "[0-9]" /proc/fs/lustre/osc/*-osc-*/{kbytesavail,cur*}|tee -a $LOG
+       $LCTL get_param "osc.*-osc-*.kbytesavail"
+       $LCTL get_param "osc.*-osc-*.cur*"
         SUCCESS=0
  fi
  
@@ -82,6 +91,9 @@ if [ "$RECORDSOUT" -ne $(($FILESIZE / 1024)) ]; then
          SUCCESS=0
  fi
  
+echo LOG LOG2 file
+cat $LOG $LOG2
+
  rm -f $OOS $OOS2
  sync; sleep 1; sync
  
diff --git a/lustre/tests/openclose.c b/lustre/tests/openclose.c

index 6ca7af7..66c05e1 100644 (file)
--- a/lustre/tests/openclose.c
+++ b/lustre/tests/openclose.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  /* for O_DIRECT */
diff --git a/lustre/tests/opendevunlink.c b/lustre/tests/opendevunlink.c

index 15ac708..af4628c 100644 (file)
--- a/lustre/tests/opendevunlink.c
+++ b/lustre/tests/opendevunlink.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #ifndef _GNU_SOURCE
@@ -114,4 +146,3 @@ int main(int argc, char **argv)
          fprintf(stderr, "Ok, everything goes well.\n");
          return 0;
  }
-
diff --git a/lustre/tests/opendirunlink.c b/lustre/tests/opendirunlink.c

index f7ad30f..932df6c 100644 (file)
--- a/lustre/tests/opendirunlink.c
+++ b/lustre/tests/opendirunlink.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  /* for O_DIRECTORY */
@@ -122,4 +154,3 @@ int main(int argc, char **argv)
          fprintf(stderr, "Ok, everything goes well.\n");
          return 0;
  }
-
diff --git a/lustre/tests/openfile.c b/lustre/tests/openfile.c

index 6638ac1..66b48c8 100644 (file)
--- a/lustre/tests/openfile.c
+++ b/lustre/tests/openfile.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #if 0
@@ -60,13 +92,13 @@ int main(int argc, char** argv)
          int    fd;
          int    flags = 0;
          mode_t mode = 0644;
-        char*  fname = NULL;
+        char  *fname = NULL;
          int    mode_set = 0;
          int    flag_set = 0;
          int    c;
          int    save_errno = 0;
          int    print_usage = 0;
-        char*  cloned_flags = NULL;
+        char  *cloned_flags = NULL;
  
          if (argc == 1)
                  Usage_and_abort();
@@ -91,9 +123,10 @@ int main(int argc, char** argv)
                                  printf("flags = %d\n",flags);
  #endif
                                  break;
-                        } else 
+                        } else {
                                  flags = 0;
-                        
+                        }
+
                          for (tmp = strtok(cloned_flags, ":|"); tmp;
                               tmp = strtok(NULL, ":|")) {
                                  int i = 0;
@@ -170,12 +203,11 @@ int main(int argc, char** argv)
          } else {
                  fprintf(stderr, "Error in opening file \"%s\"(flags=%s",
                          fname, cloned_flags);
+                if (mode_set)
+                        fprintf(stderr, ", mode=%o", mode);
+                fprintf(stderr, ") %d: %s\n", save_errno, strerror(save_errno));
          }
  
-        if (mode_set)
-                fprintf(stderr, ", mode=%o", mode);
-        fprintf(stderr, ") %d: %s\n", save_errno, strerror(save_errno));
-
  out:
          if (cloned_flags)
                  free(cloned_flags);
@@ -184,4 +216,3 @@ out:
  
          return save_errno;
  }
-
diff --git a/lustre/tests/openfilleddirunlink.c b/lustre/tests/openfilleddirunlink.c

index 9b07c64..eabffa4 100644 (file)
--- a/lustre/tests/openfilleddirunlink.c
+++ b/lustre/tests/openfilleddirunlink.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  /* for O_DIRECTORY */
diff --git a/lustre/tests/openme.c b/lustre/tests/openme.c

index 9a1f3f3..d3f7b24 100644 (file)
--- a/lustre/tests/openme.c
+++ b/lustre/tests/openme.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <fcntl.h>
  #include <unistd.h>
  #include <stdlib.h>
diff --git a/lustre/tests/openunlink.c b/lustre/tests/openunlink.c

index 4a0d8c3..d69dff3 100644 (file)
--- a/lustre/tests/openunlink.c
+++ b/lustre/tests/openunlink.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <fcntl.h>
  #include <string.h>
diff --git a/lustre/tests/parallel_grouplock.c b/lustre/tests/parallel_grouplock.c

index 6230495..d88033f 100644 (file)
--- a/lustre/tests/parallel_grouplock.c
+++ b/lustre/tests/parallel_grouplock.c
@@ -1,23 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: You Feng <youfeng@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/parallel_grouplock.c
+ *
+ * Author: You Feng <youfeng@clusterfs.com>
   */
  
  #include <mpi.h>
@@ -32,7 +50,7 @@
  #include <time.h>
  #include <errno.h>
  #include <lustre/lustre_user.h>
-#include "lp_utils.h"
+#include <lustre/tests/lp_utils.h>
  
  #define LPGL_FILEN 700000
  #define LPGL_TEST_ITEMS 7
diff --git a/lustre/tests/performance-sanity.sh b/lustre/tests/performance-sanity.sh

new file mode 100644 (file)

index 0000000..ce3d2f9
--- /dev/null
+++ b/lustre/tests/performance-sanity.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+#set -vx
+set -e
+
+TESTNAME=`basename $0 .sh`
+TMP=${TMP:-/tmp}
+LOG=${LOG:-"$TMP/${TESTNAME}.log"}
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+[ -x "$MDSRATE" ] || FAIL_ON_ERROR=true error "No mdsrate program. Aborting."
+which mpirun > /dev/null 2>&1 || \
+       FAIL_ON_ERROR=true error "No mpirun program. Aborting." 
+
+# Skip these tests
+# bug number:  15266 15266 15266
+ALWAYS_EXCEPT="1     2     6     $PERFORMANCE_SANITY_EXCEPT"
+
+build_test_filter
+
+# single-IOR-rates
+test_1() {
+    echo "Single client I/O performance as a percentage of raw"
+}
+run_test 1 "single-client IO perf ====="
+
+# parallel-IOR-rates 
+test_2() {
+    echo "MPI coordinated test of parallel filesystem system calls and library functions"
+}
+run_test 2 "multi-client IO perf ====="
+
+# mdsrate-create-small
+test_3() {
+    echo "File creation performance tests for file objects"
+    bash mdsrate-create-small.sh
+}
+run_test 3 "small file create/open/delete ======"
+
+# mdsrate-create-large
+test_4() {
+    echo "Large file creation performance"
+    bash mdsrate-create-large.sh
+}
+run_test 4 "large file create/open/delete ======"
+
+# mdsrate-lookup-1dir
+test_5() {
+    echo "Single directory lookup retrieval rate"
+    bash mdsrate-lookup-1dir.sh
+}
+run_test 5 "lookup rate 10M file dir ======"
+
+# mdsrate-lookup-10dir
+test_6() {
+    echo "Directory lookup retrieval rate 10 directories, 1 million files each"
+    bash mdsrate-lookup-10dirs.sh
+}
+run_test 6 "lookup rate 10M file 10 dir ======"
+
+# mdsrate-stat-small
+test_7() {
+    echo "File attribute retrieval rate for small file creation"
+    bash mdsrate-stat-small.sh
+}
+run_test 7 "getattr small file ======"
+
+# mdsrate-stat-large
+test_8() {
+    echo "File attribute retrieval rate for large file creation"
+    bash mdsrate-stat-large.sh
+}
+run_test 8 "getattr large files ======"
+
+equals_msg `basename $0`: test complete, cleaning up
+check_and_cleanup_lustre
+[ -f "$LOG" ] && cat $LOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/racer/racer.sh b/lustre/tests/racer/racer.sh

index c1f8b99..645e349 100755 (executable)
--- a/lustre/tests/racer/racer.sh
+++ b/lustre/tests/racer/racer.sh
@@ -3,16 +3,12 @@
  MAX_FILES=${MAX_FILES:-20}
  DIR=${DIR:-$1}
  DIR=${DIR:-"/mnt/lustre/racer"}
-if ! [ -d "$DIR" -o -d "`basename $DIR`" ]; then
-       echo "$0: '$DIR' and '`basename $DIR`' are not directories"
-       exit 1
-fi
  DURATION=${DURATION:-$((60*5))}
  
  NUM_THREADS=${NUM_THREADS:-$2}
  NUM_THREADS=${NUM_THREADS:-3}
  
-[ -e $DIR ] || mkdir $DIR
+mkdir -p $DIR
  
  racer_cleanup()
  {
@@ -32,7 +28,7 @@ trap "
      echo \"Cleaning up\" 
      racer_cleanup
      exit 0
-" 2
+" 2 15
  
  cd `dirname $0`
  for N in `seq 1 $NUM_THREADS`; do
diff --git a/lustre/tests/reads.c b/lustre/tests/reads.c

index 77ebeaa..6e58f8b 100644 (file)
--- a/lustre/tests/reads.c
+++ b/lustre/tests/reads.c
@@ -1,25 +1,43 @@
-/*
- * Lustre Reads test
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (c) 2005 Cluster File Systems, Inc.
- * Copyright (c) 2008 SUN Microsystems.
+ * GPL HEADER START
   *
- * Author: Nikita Danilov <nikita@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- * This file is part of Lustre, http://www.lustre.org.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- * Lustre is free software; you can redistribute it and/or modify it under the
- * terms of version 2 of the GNU General Public License as published by the
- * Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- * details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
   *
- * You should have received a copy of the GNU General Public License along
- * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
- * Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/reads.c
+ *
+ * Lustre Reads test
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
   */
  
  #define _XOPEN_SOURCE 500 /* for pread(2) */
diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh

new file mode 100644 (file)

index 0000000..8c47ee1
--- /dev/null
+++ b/lustre/tests/recovery-mds-scale.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+
+# Was Test 11 in cmd3.
+# For duration of 24 hours repeatedly failover a random MDS at
+# 10 minute intervals and verify that no application errors occur.
+
+# Test runs one of CLIENT_LOAD progs on remote clients.
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+DEBUGLOG=$TESTSUITELOG.debug
+exec 2>$DEBUGLOG
+echo "--- env ---" >&2
+env >&2
+echo "--- env ---" >&2
+set -x
+
+[ "$SHARED_DIRECTORY" ] || \
+    { skip "$0: Empty SHARED_DIRECTORY" && exit 0; }
+
+[ -n "$CLIENTS" ] || { skip "$0 Need two or more remote clients" && exit 0; }
+[ $CLIENTCOUNT -ge 3 ] || \
+    { skip "$0 Need two or more clients, have $CLIENTCOUNT" && exit 0; }
+
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY}/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
+build_test_filter
+
+check_and_setup_lustre
+rm -rf $DIR/[df][0-9]*
+
+# the test node needs to be insulated from a lustre failure as much as possible,
+# so not even loading the lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $(hostname) $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_item_from_list $NODES_TO_USE $(hostname))
+
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
+
+MDTS=mds
+
+OSTS=""
+for ((i=1; i<=$OSTCOUNT; i++)) do
+    OSTS="$OSTS ost$i"
+done
+OSTS=$(comma_list $OSTS)
+
+ERRORS_OK=""    # No application failures should occur during this test.
+FLAVOR=${FLAVOR:-"MDS"}
+
+rm -f $END_RUN_FILE
+
+vmstatLOG=${TESTSUITELOG}_$(basename $0 .sh).vmstat
+
+server_numfailovers () {
+    local facet
+    local var
+
+    for facet in $MDTS ${OSTS//,/ }; do
+        var=${facet}_nums
+        val=${!var}
+        if [ "$val" ] ; then
+            echo "$facet failed  over  $val times"
+        fi
+    done
+}
+
+summary_and_cleanup () {
+
+    local rc=$?
+    local var
+    trap 0
+
+    # Having not empty END_RUN_FILE means the failed loads only
+    if [ -s $END_RUN_FILE ]; then
+        echo "Found the END_RUN_FILE file: $END_RUN_FILE"
+        cat $END_RUN_FILE
+        local END_RUN_NODE=
+        read END_RUN_NODE < $END_RUN_FILE
+
+    # a client load will end (i.e. fail) if it finds
+    # the end run file.  that does not mean that that client load
+    # actually failed though.  the first node in the END_RUN_NODE is
+    # the one we are really interested in.
+        if [ -n "$END_RUN_NODE" ]; then
+            var=${END_RUN_NODE}_load
+            echo "Client load failed on node $END_RUN_NODE" 
+            echo
+            echo "client $END_RUN_NODE load stdout and debug files :
+              ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}
+              ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug"
+        fi
+        rc=1
+    fi
+     
+    echo $(date +'%F %H:%M:%S') Terminating clients loads ...
+    echo "$0" >> $END_RUN_FILE
+    local result=PASS
+    [ $rc -eq 0 ] || result=FAIL
+
+    log "Duraion:                $DURATION
+Server failover period: $SERVER_FAILOVER_PERIOD seconds
+Exited after:           $ELAPSED seconds
+Number of failovers before exit:
+$(server_numfailovers)
+Status: $result: rc=$rc"
+
+    # stop the vmstats on the OSTs
+    if [ "$VMSTAT" ]; then
+        do_nodes $(comma_list $(osts_nodes)) "test -f /tmp/vmstat.pid && \
+            { kill -s TERM \$(cat /tmp/vmstat.pid); rm -f /tmp/vmstat.pid; \
+            gzip -f9 $vmstatLOG-\$(hostname); }"
+    fi
+
+    # make sure the client loads die
+    do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && \
+        { kill -s TERM \$(cat $LOAD_PID_FILE) || true; }"
+
+    # and free up the pdshes that started them, if any are still around
+    if [ -n "$CLIENT_LOAD_PIDS" ]; then
+        kill $CLIENT_LOAD_PIDS || true
+        sleep 5
+        kill -9 $CLIENT_LOAD_PIDS || true
+    fi
+    [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
+
+    exit $rc
+}
+
+#
+# MAIN 
+#
+log "-----============= $0 starting =============-----"
+
+trap summary_and_cleanup EXIT INT
+
+DURATION=${DURATION:-$((60*60*24))}
+ELAPSED=0
+NUM_FAILOVERS=0
+
+# vmstat the osts
+if [ "$VMSTAT" ]; then
+    do_nodes $(comma_list $(osts_nodes)) "vmstat 1 > $vmstatLOG-\$(hostname) 2>/dev/null </dev/null & echo \$! > /tmp/vmstat.pid"
+fi
+
+# Start client loads.
+start_client_loads $NODES_TO_USE
+
+echo clients load pids:
+if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $LOAD_PID_FILE"; then
+    if [ -e $DEBUGLOG ]; then
+        exec 2<&-
+        cat $DEBUGLOG
+        exit 3
+    fi
+fi
+
+START_TS=$(date +%s)
+CURRENT_TS=$START_TS
+
+if [ "$FLAVOR" == "MDS" ]; then
+    SERVER_FAILOVER_PERIOD=$MDS_FAILOVER_PERIOD
+    SERVERS=$MDTS
+else
+    SERVER_FAILOVER_PERIOD=$OSS_FAILOVER_PERIOD
+    SERVERS=$OSTS
+fi
+
+SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
+
+MINSLEEP=${MINSLEEP:-120}
+REQFAIL_PERCENT=${REQFAIL_PERCENT:-3}  # bug17839 comment 62
+REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))}
+reqfail=0
+sleep=0
+while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+
+    # In order to perform the 
+    # expected number of failovers, we need to account the following :
+    # 1) the time that has elapsed during the client load checking
+    # 2) time takes for failover
+
+    it_time_start=$(date +%s)
+    
+    SERVERFACET=$(get_random_entry $SERVERS)
+    var=${SERVERFACET}_nums
+
+    # Check that our client loads are still running. If any have died, 
+    # that means they have died outside of recovery, which is unacceptable.    
+
+    log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
+    ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" 
+
+    if ! check_client_loads $NODES_TO_USE; then
+        exit 4
+    fi
+
+    log "Starting failover on $SERVERNODE"
+
+    facet_failover "$SERVERFACET" || exit 1
+
+    # Check that our client loads are still running during failover.
+    # No application failures should occur.
+
+    log "==== Checking the clients loads AFTER  failover -- failure NOT OK"
+    if ! check_client_loads $NODES_TO_USE; then
+        log "Client load failed during failover. Exiting"
+        exit 5
+    fi
+
+    # Increment the number of failovers
+    NUM_FAILOVERS=$((NUM_FAILOVERS+1))
+    val=$((${!var} + 1))
+    eval $var=$val
+ 
+    CURRENT_TS=$(date +%s)
+    ELAPSED=$((CURRENT_TS - START_TS))
+ 
+    sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
+
+    # keep count the number of itterations when
+    # time spend to failover and two client loads check exceeded 
+    # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP )
+    if [ $sleep -lt $MINSLEEP ]; then
+        reqfail=$((reqfail +1))
+        log "WARNING: failover and two check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP !
+Failed to meet interval $reqfail times ( REQFAIL=$REQFAIL ); have sleep=$sleep"
+        [ $reqfail -gt $REQFAIL ] && exit 6 
+    fi  
+
+    log "$SERVERFACET has failed over ${!var} times, and counting..."
+    if [ $sleep -gt 0 ]; then 
+        echo "sleeping $sleep seconds ... "
+        sleep $sleep
+    fi
+done
+
+exit 0
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh

index a7eaa84..54b6816 100755 (executable)
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -11,6 +11,15 @@ LUSTRE=${LUSTRE:-`dirname $0`/..}
  init_test_env $@
  . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+
+if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
+    CONFIG_EXCEPTIONS="52"
+    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
+    echo "Except the tests: $CONFIG_EXCEPTIONS"
+    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
+fi
+
  # also long tests: 19, 21a, 21e, 21f, 23, 27
  #                                   1  2.5  2.5    4    4          (min)"
  [ "$SLOW" = "no" ] && EXCEPT_SLOW="17  26a  26b    50   51     57"
@@ -22,65 +31,71 @@ build_test_filter
  SETUP=${SETUP:-""}
  CLEANUP=${CLEANUP:-""}
  
-cleanup_and_setup_lustre
+check_and_setup_lustre
+
  assert_DIR
  rm -rf $DIR/[df][0-9]*
  
  test_1() {
-    drop_request "mcreate $MOUNT/1"  || return 1
-    drop_reint_reply "mcreate $MOUNT/2"    || return 2
+    drop_request "mcreate $DIR/f1"  || return 1
+    drop_reint_reply "mcreate $DIR/f2"    || return 2
  }
  run_test 1 "mcreate: drop req, drop rep"
  
  test_2() {
-    drop_request "tchmod 111 $MOUNT/2"  || return 1
-    drop_reint_reply "tchmod 666 $MOUNT/2"    || return 2
+    drop_request "tchmod 111 $DIR/f2"  || return 1
+    drop_reint_reply "tchmod 666 $DIR/f2"    || return 2
  }
  run_test 2 "chmod: drop req, drop rep"
  
  test_3() {
-    drop_request "statone $MOUNT/2" || return 1
-    drop_reply "statone $MOUNT/2"   || return 2
+    drop_request "statone $DIR/f2" || return 1
+    drop_reply "statone $DIR/f2"   || return 2
  }
  run_test 3 "stat: drop req, drop rep"
  
-SAMPLE_NAME=recovery-small.junk
+SAMPLE_NAME=f0.recovery-small.junk
  SAMPLE_FILE=$TMP/$SAMPLE_NAME
  # make this big, else test 9 doesn't wait for bulk -- bz 5595
  dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4
  
  test_4() {
-    do_facet client "cp $SAMPLE_FILE $MOUNT/$SAMPLE_NAME" || return 1
-    drop_request "cat $MOUNT/$SAMPLE_NAME > /dev/null"   || return 2
-    drop_reply "cat $MOUNT/$SAMPLE_NAME > /dev/null"     || return 3
+    do_facet client "cp $SAMPLE_FILE $DIR/$SAMPLE_NAME" || return 1
+    drop_request "cat $DIR/$SAMPLE_NAME > /dev/null"   || return 2
+    drop_reply "cat $DIR/$SAMPLE_NAME > /dev/null"     || return 3
  }
  run_test 4 "open: drop req, drop rep"
  
+RENAMED_AGAIN=$DIR/f0.renamed-again
+
  test_5() {
-    drop_request "mv $MOUNT/$SAMPLE_NAME $MOUNT/renamed" || return 1
-    drop_reint_reply "mv $MOUNT/renamed $MOUNT/renamed-again" || return 2
-    do_facet client "checkstat -v $MOUNT/renamed-again"  || return 3
+    drop_request "mv $DIR/$SAMPLE_NAME $DIR/$tfile-renamed" || return 1
+    drop_reint_reply "mv $DIR/$tfile-renamed $RENAMED_AGAIN" || return 2
+    do_facet client "checkstat -v $RENAMED_AGAIN"  || return 3
  }
  run_test 5 "rename: drop req, drop rep"
  
-[ ! -e $MOUNT/renamed-again ] && cp $SAMPLE_FILE $MOUNT/renamed-again
+[ ! -e $RENAMED_AGAIN ] && cp $SAMPLE_FILE $RENAMED_AGAIN
+LINK1=$DIR/f0.link1
+LINK2=$DIR/f0.link2
+
  test_6() {
-    drop_request "mlink $MOUNT/renamed-again $MOUNT/link1" || return 1
-    drop_reint_reply "mlink $MOUNT/renamed-again $MOUNT/link2"   || return 2
+    drop_request "mlink $RENAMED_AGAIN $LINK1" || return 1
+    drop_reint_reply "mlink $RENAMED_AGAIN $LINK2"   || return 2
  }
  run_test 6 "link: drop req, drop rep"
  
-[ ! -e $MOUNT/link1 ] && mlink $MOUNT/renamed-again $MOUNT/link1
-[ ! -e $MOUNT/link2 ] && mlink $MOUNT/renamed-again $MOUNT/link2
+[ ! -e $LINK1 ] && mlink $RENAMED_AGAIN $LINK1
+[ ! -e $LINK2 ] && mlink $RENAMED_AGAIN $LINK2
  test_7() {
-    drop_request "munlink $MOUNT/link1"   || return 1
-    drop_reint_reply "munlink $MOUNT/link2"     || return 2
+    drop_request "munlink $LINK1"   || return 1
+    drop_reint_reply "munlink $LINK2"     || return 2
  }
  run_test 7 "unlink: drop req, drop rep"
  
  #bug 1423
  test_8() {
-    drop_reint_reply "touch $MOUNT/$tfile"    || return 1
+    drop_reint_reply "touch $DIR/$tfile"    || return 1
  }
  run_test 8 "touch: drop rep (bug 1423)"
  
@@ -89,75 +104,77 @@ dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4
  
  #bug 1420
  test_9() {
-    pause_bulk "cp /etc/profile $MOUNT/$tfile"       || return 1
-    do_facet client "cp $SAMPLE_FILE $MOUNT/${tfile}.2"  || return 2
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+    pause_bulk "cp /etc/profile $DIR/$tfile"       || return 1
+    do_facet client "cp $SAMPLE_FILE $DIR/${tfile}.2"  || return 2
      do_facet client "sync"
-    do_facet client "rm $MOUNT/$tfile $MOUNT/${tfile}.2" || return 3
+    do_facet client "rm $DIR/$tfile $DIR/${tfile}.2" || return 3
  }
  run_test 9 "pause bulk on OST (bug 1420)"
  
  #bug 1521
  test_10() {
-    do_facet client mcreate $MOUNT/$tfile        || return 1
-    drop_bl_callback "chmod 0777 $MOUNT/$tfile"  || echo "evicted as expected"
+    do_facet client mcreate $DIR/$tfile        || return 1
+    drop_bl_callback "chmod 0777 $DIR/$tfile"  || echo "evicted as expected"
      # wait for the mds to evict the client
      #echo "sleep $(($TIMEOUT*2))"
      #sleep $(($TIMEOUT*2))
-    do_facet client touch $MOUNT/$tfile || echo "touch failed, evicted"
-    do_facet client checkstat -v -p 0777 $MOUNT/$tfile  || return 3
-    do_facet client "munlink $MOUNT/$tfile"
+    do_facet client touch $DIR/$tfile || echo "touch failed, evicted"
+    do_facet client checkstat -v -p 0777 $DIR/$tfile  || return 3
+    do_facet client "munlink $DIR/$tfile"
  }
  run_test 10 "finish request on server after client eviction (bug 1521)"
  
  #bug 2460
  # wake up a thread waiting for completion after eviction
  test_11(){
-    do_facet client multiop $MOUNT/$tfile Ow  || return 1
-    do_facet client multiop $MOUNT/$tfile or  || return 2
+    do_facet client multiop $DIR/$tfile Ow  || return 1
+    do_facet client multiop $DIR/$tfile or  || return 2
  
      cancel_lru_locks osc
  
-    do_facet client multiop $MOUNT/$tfile or  || return 3
-    drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected"
+    do_facet client multiop $DIR/$tfile or  || return 3
+    drop_bl_callback multiop $DIR/$tfile Ow || echo "evicted as expected"
  
-    do_facet client munlink $MOUNT/$tfile  || return 4
+    do_facet client munlink $DIR/$tfile  || return 4
  }
  run_test 11 "wake up a thread waiting for completion after eviction (b=2460)"
  
  #b=2494
  test_12(){
-    $LCTL mark multiop $MOUNT/$tfile OS_c 
+    $LCTL mark multiop $DIR/$tfile OS_c 
      do_facet mds "lctl set_param fail_loc=0x115"
      clear_failloc mds $((TIMEOUT * 2)) &
-    multiop_bg_pause $MOUNT/$tfile OS_c || return 1
+    multiop_bg_pause $DIR/$tfile OS_c || return 1
      PID=$!
  #define OBD_FAIL_MDS_CLOSE_NET           0x115
      kill -USR1 $PID
      echo "waiting for multiop $PID"
      wait $PID || return 2
-    do_facet client munlink $MOUNT/$tfile  || return 3
+    do_facet client munlink $DIR/$tfile  || return 3
  }
  run_test 12 "recover from timed out resend in ptlrpcd (b=2494)"
  
  # Bug 113, check that readdir lost recv timeout works.
  test_13() {
-    mkdir $MOUNT/readdir || return 1
-    touch $MOUNT/readdir/newentry || return
+    mkdir -p $DIR/$tdir || return 1
+    touch $DIR/$tdir/newentry || return
  # OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE
      do_facet mds "lctl set_param fail_loc=0x80000104"
-    ls $MOUNT/readdir || return 3
+    ls $DIR/$tdir || return 3
      do_facet mds "lctl set_param fail_loc=0"
-    rm -rf $MOUNT/readdir || return 4
+    rm -rf $DIR/$tdir || return 4
  }
  run_test 13 "mdc_readpage restart test (bug 1138)"
  
  # Bug 113, check that readdir lost send timeout works.
  test_14() {
-    mkdir $MOUNT/readdir
-    touch $MOUNT/readdir/newentry
+    mkdir -p $DIR/$tdir
+    touch $DIR/$tdir/newentry
  # OBD_FAIL_MDS_SENDPAGE|OBD_FAIL_ONCE
      do_facet mds "lctl set_param fail_loc=0x80000106"
-    ls $MOUNT/readdir || return 1
+    ls $DIR/$tdir || return 1
      do_facet mds "lctl set_param fail_loc=0"
  }
  run_test 14 "mdc_readpage resend test (bug 1138)"
@@ -179,7 +196,9 @@ start_read_ahead() {
  }
  
  test_16() {
-    do_facet client cp $SAMPLE_FILE $MOUNT
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+    do_facet client cp $SAMPLE_FILE $DIR
      sync
      stop_read_ahead
  
@@ -187,11 +206,11 @@ test_16() {
      do_facet ost1 lctl set_param fail_loc=0x80000504
      cancel_lru_locks osc
      # OST bulk will time out here, client resends
-    do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 1
+    do_facet client "cmp $SAMPLE_FILE $DIR/${SAMPLE_FILE##*/}" || return 1
      do_facet ost1 lctl set_param fail_loc=0
      # give recovery a chance to finish (shouldn't take long)
      sleep $TIMEOUT
-    do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 2
+    do_facet client "cmp $SAMPLE_FILE $DIR/${SAMPLE_FILE##*/}" || return 2
      start_read_ahead
  }
  run_test 16 "timeout bulk put, don't evict client (2732)"
@@ -199,6 +218,8 @@ run_test 16 "timeout bulk put, don't evict client (2732)"
  test_17() {
      local at_max_saved=0
  
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
      if at_is_valid && at_is_enabled; then
          at_max_saved=$(at_max_get ost1)
@@ -229,14 +250,19 @@ run_test 17 "timeout bulk get, don't evict client (2732)"
  test_18a() {
      [ -z ${ost2_svc} ] && skip "needs 2 osts" && return 0
  
-    do_facet client mkdir -p $MOUNT/$tdir
-    f=$MOUNT/$tdir/$tfile
+    do_facet client mkdir -p $DIR/$tdir
+    f=$DIR/$tdir/$tfile
  
      cancel_lru_locks osc
      pgcache_empty || return 1
  
      # 1 stripe on ost2
      lfs setstripe $f -s $((128 * 1024)) -i 1 -c 1
+    get_stripe_info client $f
+    if [ $stripe_index -ne 1 ]; then
+        lfs getstripe $f
+        error "$f: different stripe offset ($stripe_index)" && return
+    fi
  
      do_facet client cp $SAMPLE_FILE $f
      sync
@@ -253,16 +279,21 @@ test_18a() {
  run_test 18a "manual ost invalidate clears page cache immediately"
  
  test_18b() {
-    do_facet client mkdir -p $MOUNT/$tdir
-    f=$MOUNT/$tdir/$tfile
-    f2=$MOUNT/$tdir/${tfile}-2
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+    do_facet client mkdir -p $DIR/$tdir
+    f=$DIR/$tdir/$tfile
  
      cancel_lru_locks osc
      pgcache_empty || return 1
  
      # shouldn't have to set stripe size of count==1
      lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1
-    lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1
+    get_stripe_info client $f
+    if [ $stripe_index -ne 0 ]; then
+        lfs getstripe $f
+        error "$f: different stripe offset ($stripe_index)" && return
+    fi
  
      do_facet client cp $SAMPLE_FILE $f
      sync
@@ -275,22 +306,27 @@ test_18b() {
      # cache after the client reconnects?     
      rc=0
      pgcache_empty || rc=2
-    rm -f $f $f2
+    rm -f $f
      return $rc
  }
  run_test 18b "eviction and reconnect clears page cache (2766)"
  
  test_18c() {
-    do_facet client mkdir -p $MOUNT/$tdir
-    f=$MOUNT/$tdir/$tfile
-    f2=$MOUNT/$tdir/${tfile}-2
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+    do_facet client mkdir -p $DIR/$tdir
+    f=$DIR/$tdir/$tfile
  
      cancel_lru_locks osc
      pgcache_empty || return 1
  
      # shouldn't have to set stripe size of count==1
      lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1
-    lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1
+    get_stripe_info client $f
+    if [ $stripe_index -ne 0 ]; then
+        lfs getstripe $f
+        error "$f: different stripe offset ($stripe_index)" && return
+    fi
  
      do_facet client cp $SAMPLE_FILE $f
      sync
@@ -306,13 +342,13 @@ test_18c() {
      # cache after the client reconnects?     
      rc=0
      pgcache_empty || rc=2
-    rm -f $f $f2
+    rm -f $f
      return $rc
  }
  run_test 18c "Dropped connect reply after eviction handing (14755)"
  
  test_19a() {
-    f=$MOUNT/$tfile
+    f=$DIR/$tfile
      do_facet client mcreate $f        || return 1
      drop_ldlm_cancel "chmod 0777 $f"  || echo "evicted as expected"
  
@@ -324,7 +360,7 @@ test_19a() {
  run_test 19a "test expired_lock_main on mds (2867)"
  
  test_19b() {
-    f=$MOUNT/$tfile
+    f=$DIR/$tfile
      do_facet client multiop $f Ow  || return 1
      do_facet client multiop $f or  || return 2
  
@@ -338,6 +374,8 @@ test_19b() {
  run_test 19b "test expired_lock_main on ost (2867)"
  
  test_20a() {   # bug 2983 - ldlm_handle_enqueue cleanup
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
         mkdir -p $DIR/$tdir
         lfs setstripe $DIR/$tdir/${tfile} -i 0 -c 1
         multiop_bg_pause $DIR/$tdir/${tfile} O_wc || return 1
@@ -353,6 +391,8 @@ test_20a() {        # bug 2983 - ldlm_handle_enqueue cleanup
  run_test 20a "ldlm_handle_enqueue error (should return error)" 
  
  test_20b() {   # bug 2986 - ldlm_handle_enqueue error during open
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
         mkdir -p $DIR/$tdir
         lfs setstripe $DIR/$tdir/${tfile} -i 0 -c 1
         cancel_lru_locks osc
@@ -591,8 +631,11 @@ test_23() { #b=4561
  }
  run_test 23 "client hang when close a file after mds crash"
  
-test_24() {    # bug 2248 - eviction fails writeback but app doesn't see it
+test_24() { # bug 11710 details correct fsync() behavior
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
         mkdir -p $DIR/$tdir
+       lfs setstripe $DIR/$tdir -s 0 -i 0 -c 1
         cancel_lru_locks osc
         multiop_bg_pause $DIR/$tdir/$tfile Owy_wyc || return 1
         MULTI_PID=$!
@@ -609,6 +652,7 @@ run_test 24 "fsync error (should return error)"
  test_26a() {      # was test_26 bug 5921 - evict dead exports by pinger
  # this test can only run from a client on a separate node.
         remote_ost || { skip "local OST" && return 0; }
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
         remote_mds || { skip "local MDS" && return 0; }
         OST_FILE=obdfilter.${ost1_svc}.num_exports
          OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`"
@@ -631,6 +675,8 @@ test_26a() {      # was test_26 bug 5921 - evict dead exports by pinger
  run_test 26a "evict dead exports"
  
  test_26b() {      # bug 10140 - evict dead exports by pinger
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
         client_df
         zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
         MDS_FILE=mds.${mds_svc}.num_exports
@@ -655,37 +701,39 @@ test_26b() {      # bug 10140 - evict dead exports by pinger
  run_test 26b "evict dead exports"
  
  test_27() {
-       remote_mds && { skip "remote MDS" && return 0; }
         mkdir -p $DIR/$tdir
         writemany -q -a $DIR/$tdir/$tfile 0 5 &
         CLIENT_PID=$!
         sleep 1
+       local save_FAILURE_MODE=$FAILURE_MODE
         FAILURE_MODE="SOFT"
         facet_failover mds
  #define OBD_FAIL_OSC_SHUTDOWN            0x407
-       lctl set_param fail_loc=0x80000407
+       do_facet mds lctl set_param fail_loc=0x80000407
         # need to wait for reconnect
         echo -n waiting for fail_loc
-       while [ `lctl get_param -n fail_loc` -eq -2147482617 ]; do
+       while [ $(do_facet mds lctl get_param -n fail_loc) -eq -2147482617 ]; do
             sleep 1
             echo -n .
         done
+       do_facet mds lctl get_param -n fail_loc
         facet_failover mds
         #no crashes allowed!
          kill -USR1 $CLIENT_PID
         wait $CLIENT_PID 
         true
+       FAILURE_MODE=$save_FAILURE_MODE
  }
  run_test 27 "fail LOV while using OSC's"
  
  test_28() {      # bug 6086 - error adding new clients
-       do_facet client mcreate $MOUNT/$tfile       || return 1
-       drop_bl_callback "chmod 0777 $MOUNT/$tfile" ||echo "evicted as expected"
+       do_facet client mcreate $DIR/$tfile       || return 1
+       drop_bl_callback "chmod 0777 $DIR/$tfile" ||echo "evicted as expected"
         #define OBD_FAIL_MDS_ADD_CLIENT 0x12f
         do_facet mds lctl set_param fail_loc=0x8000012f
         # fail once (evicted), reconnect fail (fail_loc), ok
         df || (sleep 1; df) || (sleep 1; df) || error "reconnect failed"
-       rm -f $MOUNT/$tfile
+       rm -f $DIR/$tfile
         fail mds                # verify MDS last_rcvd can be loaded
  }
  run_test 28 "handle error adding new clients (bug 6086)"
@@ -770,6 +818,8 @@ test_52_guts() {
  }
  
  test_52() {
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
         mkdir -p $DIR/$tdir
         test_52_guts 1
         rc=$?
@@ -810,11 +860,12 @@ run_test 54 "back in time"
  
  # bug 11330 - liblustre application death during I/O locks up OST
  test_55() {
-       remote_ost && { skip "remote OST" && return 0; }
+       remote_ost_nodsh && skip "remote OST with nodsh" && return 0
  
         mkdir -p $DIR/$tdir
  
         # first dd should be finished quickly
+       lfs setstripe $DIR/$tdir/$tfile-1 -c 1 -i 0
         dd if=/dev/zero of=$DIR/$tdir/$tfile-1 bs=32M count=4  &
         DDPID=$!
         count=0
@@ -829,8 +880,9 @@ test_55() {
         done    
         echo "(dd_pid=$DDPID, time=$count)successful"
  
-        #define OBD_FAIL_OST_DROP_REQ            0x21d
-       do_facet ost lctl set_param fail_loc=0x0000021d
+       lfs setstripe $DIR/$tdir/$tfile-2 -c 1 -i 0
+       #define OBD_FAIL_OST_DROP_REQ            0x21d
+       do_facet ost1 lctl set_param fail_loc=0x0000021d
         # second dd will be never finished
         dd if=/dev/zero of=$DIR/$tdir/$tfile-2 bs=32M count=4  &        
         DDPID=$!
@@ -849,7 +901,7 @@ test_55() {
         echo "(dd_pid=$DDPID, time=$count)successful"
  
         #Recover fail_loc and dd will finish soon
-       do_facet ost lctl set_param fail_loc=0
+       do_facet ost1 lctl set_param fail_loc=0
         count=0
         echo  "step3: testing ......"
         while [ true ]; do
@@ -901,14 +953,14 @@ run_test 57 "read procfs entries causes kernel crash"
  
  test_58() { # bug 11546
  #define OBD_FAIL_MDC_ENQUEUE_PAUSE        0x801
-        touch $MOUNT/$tfile
-        ls -la $MOUNT/$tfile
+        touch $DIR/$tfile
+        ls -la $DIR/$tfile
          lctl set_param fail_loc=0x80000801
-        cp $MOUNT/$tfile /dev/null &
+        cp $DIR/$tfile /dev/null &
          pid=$!
          sleep 1
          lctl set_param fail_loc=0
-        drop_bl_callback rm -f $MOUNT/$tfile
+        drop_bl_callback rm -f $DIR/$tfile
          wait $pid
          do_facet client "df $DIR"
  }
@@ -933,4 +985,4 @@ run_test 59 "Read cancel race on client eviction"
  
  equals_msg `basename $0`: test complete, cleaning up
  check_and_cleanup_lustre
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/rename_many.c b/lustre/tests/rename_many.c

index 5bf46d2..7a366b3 100644 (file)
--- a/lustre/tests/rename_many.c
+++ b/lustre/tests/rename_many.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #define PATH_LENGTH 35
  #include <math.h>
  #include <signal.h>
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh

index cf397cb..122dc6d 100755 (executable)
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -2,7 +2,7 @@
  
  set -e
  
-# bug number:  10124 
+# bug number:  10124
  ALWAYS_EXCEPT="15c   $REPLAY_DUAL_EXCEPT"
  
  SAVE_PWD=$PWD
@@ -13,16 +13,30 @@ CLEANUP=${CLEANUP:-""}
  MOUNT_2=${MOUNT_2:-"yes"}
  . $LUSTRE/tests/test-framework.sh
  
+if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
+    CONFIG_EXCEPTIONS="17"
+    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
+    echo "Except the tests: $CONFIG_EXCEPTIONS"
+    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
+fi
+
  init_test_env $@
  
  . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  
-#
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+
  [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 5 14"
  
  build_test_filter
  
-cleanup_and_setup_lustre
+check_and_setup_lustre
+MOUNTED=$(mounted_lustre_filesystems)
+if ! $(echo $MOUNTED | grep -w -q $MOUNT2); then
+    zconf_mount $HOSTNAME $MOUNT2
+    MOUNTED2=yes
+fi
+
  assert_DIR
  rm -rf $DIR/[df][0-9]*
  
@@ -241,7 +255,7 @@ test_13() {
  }
  run_test 13 "close resend timeout"
  
-test_14() {
+test_14a() {
      replay_barrier mds
      createmany -o $MOUNT1/$tfile- 25
      createmany -o $MOUNT2/$tfile-2- 1
@@ -250,16 +264,46 @@ test_14() {
  
      facet_failover mds
      # expect recovery to fail due to missing client 2
-    df $MOUNT && return 1
+    df $MOUNT1 && return 1
      sleep 1
  
-    # first 25 files should have been replayed 
+    # first 25 files should have been replayed
      unlinkmany $MOUNT1/$tfile- 25 || return 2
  
-    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" 
+    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
      return 0
  }
-run_test 14 "timeouts waiting for lost client during replay"
+run_test 14a "timeouts waiting for lost client during replay"
+
+test_14b() {
+    BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
+    #lfs setstripe --index=0 --count=1 $MOUNT1
+    mkdir -p $MOUNT1/$tdir
+    #lfs setstripe --index=0 --count=1 $MOUNT1/$tdir
+    replay_barrier mds
+    createmany -o $MOUNT1/$tfile- 5
+    echo "data" > $MOUNT2/$tdir/$tfile-2
+    createmany -o $MOUNT1/$tfile-3- 5
+    umount $MOUNT2
+
+    facet_failover mds
+    # expect recovery don't fail due to VBR
+    df $MOUNT1 || return 1
+
+    # first 25 files should have been replayed
+    unlinkmany $MOUNT1/$tfile- 5 || return 2
+    unlinkmany $MOUNT1/$tfile-3- 5 || return 3
+
+    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
+    # give ost time to process llogs
+    sleep 3
+    AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
+    log "before $BEFOREUSED, after $AFTERUSED"
+    [ $AFTERUSED -ne $BEFOREUSED ] && \
+        error "after $AFTERUSED > before $BEFOREUSED" && return 4
+    return 0
+}
+run_test 14b "delete ost orphans if gap occured in objids due to VBR"
  
  test_15a() {   # was test_15
      replay_barrier mds
@@ -281,14 +325,14 @@ run_test 15a "timeout waiting for lost client during replay, 1 client completes"
  test_15c() {
      replay_barrier mds
      for ((i = 0; i < 2000; i++)); do
-       echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
+        echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
      done
-    
+
      umount $MOUNT2
      facet_failover mds
  
      df $MOUNT || return 1
-    
+
      zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
      return 0
  }
@@ -314,6 +358,8 @@ test_16() {
  run_test 16 "fail MDS during recovery (3571)"
  
  test_17() {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      createmany -o $MOUNT1/$tfile- 25
      createmany -o $MOUNT2/$tfile-2- 1
  
@@ -369,8 +415,36 @@ test_19() { # Bug 10991 - resend of open request does not fail assertion.
  }
  run_test 19 "resend of open request"
  
+test_20() { #16389
+    BEFORE=`date +%s`
+    replay_barrier mds
+    touch $MOUNT1/a
+    touch $MOUNT2/b
+    umount $MOUNT2
+    facet_failover mds
+    df $MOUNT1 || return 1
+    rm $MOUNT1/a
+    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
+    TIER1=$((`date +%s` - BEFORE))
+    BEFORE=`date +%s`
+    replay_barrier mds
+    touch $MOUNT1/a
+    touch $MOUNT2/b
+    umount $MOUNT2
+    facet_failover mds
+    df $MOUNT1 || return 1
+    rm $MOUNT1/a
+    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
+    TIER2=$((`date +%s` - BEFORE))
+    [ $TIER2 -ge $((TIER1 * 2)) ] && \
+        error "recovery time is growing $TIER2 > $TIER1"
+    return 0
+}
+run_test 20 "recovery time is not increasing"
+
  equals_msg `basename $0`: test complete, cleaning up
  SLEEP=$((`date +%s` - $NOW))
  [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
+[ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
  check_and_cleanup_lustre
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh

index 128f020..c284349 100755 (executable)
--- a/lustre/tests/replay-ost-single.sh
+++ b/lustre/tests/replay-ost-single.sh
@@ -10,9 +10,18 @@ CLEANUP=${CLEANUP:-""}
  init_test_env $@
  . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  
-ostfailover_HOST=${ostfailover_HOST:-$ost_HOST}
+# While we do not use OSTCOUNT=1 setup anymore,
+# ost1failover_HOST is used
+#ostfailover_HOST=${ostfailover_HOST:-$ost_HOST}
  #failover= must be defined in OST_MKFS_OPTIONS if ostfailover_HOST != ost_HOST
  
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
+if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
+    skip "$0: Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
+    exit 0
+fi
+
  # Tests that fail on uml
  CPU=`awk '/model/ {print $4}' /proc/cpuinfo`
  [ "$CPU" = "UML" ] && EXCEPT="$EXCEPT 6"
@@ -22,89 +31,91 @@ CPU=`awk '/model/ {print $4}' /proc/cpuinfo`
  ALWAYS_EXCEPT="$REPLAY_OST_SINGLE_EXCEPT"
  
  #                                      
-[ "$SLOW" = "no" ] && EXCEPT_SLOW=""
-
-# It is replay-ost-single, after all
-OSTCOUNT=1
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="5"
  
  build_test_filter
  
-REFORMAT=--reformat cleanup_and_setup_lustre
+check_and_setup_lustre
  assert_DIR
  rm -rf $DIR/[df][0-9]*
  
+TDIR=$DIR/d0.${TESTSUITE}
+mkdir -p $TDIR 
+$LFS setstripe $TDIR -i 0 -c 1
+$LFS getstripe $TDIR
+
  test_0a() {
      zconf_umount `hostname` $MOUNT -f
      # needs to run during initial client->OST connection
      #define OBD_FAIL_OST_ALL_REPLY_NET       0x211
-    do_facet ost "lctl set_param fail_loc=0x80000211"
+    do_facet ost1 "lctl set_param fail_loc=0x80000211"
      zconf_mount `hostname` $MOUNT && df $MOUNT || error "0a mount fail"
  }
  run_test 0a "target handle mismatch (bug 5317) `date +%H:%M:%S`"
  
  test_0b() {
      fail ost1
-    cp /etc/profile  $DIR/$tfile
+    cp /etc/profile  $TDIR/$tfile
      sync
-    diff /etc/profile $DIR/$tfile
-    rm -f $DIR/$tfile
+    diff /etc/profile $TDIR/$tfile
+    rm -f $TDIR/$tfile
  }
  run_test 0b "empty replay"
  
  test_1() {
-    date > $DIR/$tfile
+    date > $TDIR/$tfile
      fail ost1
-    $CHECKSTAT -t file $DIR/$tfile || return 1
-    rm -f $DIR/$tfile
+    $CHECKSTAT -t file $TDIR/$tfile || return 1
+    rm -f $TDIR/$tfile
  }
  run_test 1 "touch"
  
  test_2() {
      for i in `seq 10`; do
-        echo "tag-$i" > $DIR/$tfile-$i
+        echo "tag-$i" > $TDIR/$tfile-$i
      done 
      fail ost1
      for i in `seq 10`; do
-      grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i"
+      grep -q "tag-$i" $TDIR/$tfile-$i || error "f2-$i"
      done 
-    rm -f $DIR/$tfile-*
+    rm -f $TDIR/$tfile-*
  }
  run_test 2 "|x| 10 open(O_CREAT)s"
  
  test_3() {
      verify=$ROOT/tmp/verify-$$
-    dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $DIR/$tfile &
+    dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $TDIR/$tfile &
      ddpid=$!
      sync &
      fail ost1
      wait $ddpid || return 1
-    cmp $verify $DIR/$tfile || return 2
-    rm -f $verify $DIR/$tfile
+    cmp $verify $TDIR/$tfile || return 2
+    rm -f $verify $TDIR/$tfile
  }
  run_test 3 "Fail OST during write, with verification"
  
  test_4() {
      verify=$ROOT/tmp/verify-$$
-    dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $DIR/$tfile
+    dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $TDIR/$tfile
      # invalidate cache, so that we're reading over the wire
      cancel_lru_locks osc
-    cmp $verify $DIR/$tfile &
+    cmp $verify $TDIR/$tfile &
      cmppid=$!
      fail ost1
      wait $cmppid || return 1
-    rm -f $verify $DIR/$tfile
+    rm -f $verify $TDIR/$tfile
  }
  run_test 4 "Fail OST during read, with verification"
  
  test_5() {
-    [ -z "`which iozone 2> /dev/null`" ] && log "iozone missing" && return
-    FREE=`df -P $DIR | tail -n 1 | awk '{ print $4/2 }'`
+    [ -z "`which iozone 2> /dev/null`" ] && skip "iozone missing" && return 0
+    FREE=`df -P $TDIR | tail -n 1 | awk '{ print $4/2 }'`
      GB=1048576  # 1048576KB == 1GB
      if (( FREE > GB )); then
          FREE=$GB
      fi
      IOZONE_OPTS="-i 0 -i 1 -i 2 -+d -r 4 -s $FREE"
-    iozone $IOZONE_OPTS -f $DIR/$tfile &
+    iozone $IOZONE_OPTS -f $TDIR/$tfile &
      PID=$!
      
      sleep 8
@@ -112,22 +123,26 @@ test_5() {
      wait $PID
      RC=$?
      log "iozone rc=$RC"
-    rm -f $DIR/$tfile
+    rm -f $TDIR/$tfile
      [ $RC -ne 0 ] && return $RC || true
  }
  run_test 5 "Fail OST during iozone"
  
  kbytesfree() {
-   awk '{total+=$1} END {print total}' /proc/fs/lustre/osc/*-osc-*/kbytesfree
+   calc_osc_kbytes kbytesfree
  }
  
  test_6() {
-    f=$DIR/$tfile
+    remote_mds_nodsh && skip "remote MDS with nodsh" && return 0
+
+    f=$TDIR/$tfile
      rm -f $f
      sync && sleep 2 && sync    # wait for delete thread
      before=`kbytesfree`
      dd if=/dev/urandom bs=4096 count=1280 of=$f || return 28
      lfs getstripe $f
+    get_stripe_info client $f
+
      sync
      sleep 2                                    # ensure we have a fresh statfs
      sync
@@ -137,7 +152,7 @@ test_6() {
      log "before: $before after_dd: $after_dd"
      (( $before > $after_dd )) || return 1
      rm -f $f
-    fail ost1
+    fail ost$((stripe_index + 1))
      $CHECKSTAT -t file $f && return 2 || true
      sync
      # let the delete happen
@@ -149,7 +164,7 @@ test_6() {
  run_test 6 "Fail OST before obd_destroy"
  
  test_7() {
-    f=$DIR/$tfile
+    f=$TDIR/$tfile
      rm -f $f
      sync && sleep 2 && sync    # wait for delete thread
      before=`kbytesfree`
@@ -175,4 +190,4 @@ run_test 7 "Fail OST before obd_destroy"
  
  equals_msg `basename $0`: test complete, cleaning up
  check_and_cleanup_lustre
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh

index baab91d..33ac920 100755 (executable)
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -16,16 +16,25 @@ init_test_env $@
  CHECK_GRANT=${CHECK_GRANT:-"yes"}
  GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
  
+remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
+
  # Skip these tests
  # bug number:
  ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT"
  
+if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
+    CONFIG_EXCEPTIONS="0b 42 47 61a 61c"
+    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
+    echo "Except the tests: $CONFIG_EXCEPTIONS"
+    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
+fi
+
  #                                                  63 min  7 min  AT AT AT AT"
  [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a     44b    65 66 67 68"
  
  build_test_filter
  
-cleanup_and_setup_lustre
+check_and_setup_lustre
  
  mkdir -p $DIR
  
@@ -39,6 +48,8 @@ test_0a() {   # was test_0
  run_test 0a "empty replay"
  
  test_0b() {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      # this test attempts to trigger a race in the precreation code, 
      # and must run before any other objects are created on the filesystem
      fail ost1
@@ -47,6 +58,15 @@ test_0b() {
  }
  run_test 0b "ensure object created after recover exists. (3284)"
  
+test_0c() {
+    replay_barrier mds
+    umount $DIR
+    facet_failover mds
+    zconf_mount `hostname` $DIR || error "mount fails"
+    df $DIR || error "post-failover df failed"
+}
+run_test 0c "expired recovery with no clients"
+
  test_1() {
      replay_barrier mds
      mcreate $DIR/$tfile
@@ -303,7 +323,6 @@ test_15() {
  }
  run_test 15 "open(O_CREAT), unlink |X|  touch new, close"
  
-
  test_16() {
      replay_barrier mds
      mcreate $DIR/$tfile
@@ -416,6 +435,7 @@ test_20c() { # bug 10480
      kill -USR1 $pid
      test -s $DIR/$tfile || error "File was truncated"
  
+    wait $pid || return 1
      return 0
  }
  run_test 20c "check that client eviction does not affect file content"
@@ -627,7 +647,8 @@ test_32() {
      df $MOUNT || sleep 1 && df $MOUNT || return 1
      kill -USR1 $pid1
      kill -USR1 $pid2
-    sleep 1
+    wait $pid1 || return 4
+    wait $pid2 || return 5
      return 0
  }
  run_test 32 "close() notices client eviction; close() after client eviction"
@@ -652,6 +673,7 @@ test_34() {
      replay_barrier mds
      fail_abort mds
      kill -USR1 $pid
+    wait $pid || return 3
      [ -e $DIR/$tfile ] && return 1
      sync
      return 0
@@ -703,6 +725,7 @@ test_37() {
      fail_abort mds
      kill -USR1 $pid
      dmesg | grep  "mds_unlink_orphan.*error .* unlinking orphan" && return 1
+    wait $pid || return 3
      sync
      return 0
  }
@@ -821,6 +844,8 @@ run_test 42 "recovery after ost failure"
  
  # timeout in MDS/OST recovery RPC will LBUG MDS
  test_43() { # bug 2530
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      replay_barrier mds
  
      # OBD_FAIL_OST_CREATE_NET 0x204
@@ -913,6 +938,8 @@ test_46() {
  run_test 46 "Don't leak file handle after open resend (3325)"
  
  test_47() { # bug 2824
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      # create some files to make sure precreate has been done on all 
      # OSTs. (just in case this test is run independently)
      createmany -o $DIR/$tfile 20  || return 1
@@ -936,17 +963,18 @@ test_47() { # bug 2824
  run_test 47 "MDS->OSC failure during precreate cleanup (2824)"
  
  test_48() {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+    [ "$OSTCOUNT" -lt "2" ] && skip "$OSTCOUNT < 2 OSTs -- skipping" && return
+
      replay_barrier mds
      createmany -o $DIR/$tfile 20  || return 1
      # OBD_FAIL_OST_EROFS 0x216
-    fail mds
+    facet_failover mds
      do_facet ost1 "lctl set_param fail_loc=0x80000216"
      df $MOUNT || return 2
  
      createmany -o $DIR/$tfile 20 20 || return 2
      unlinkmany $DIR/$tfile 40 || return 3
-
-    do_facet ost1 "lctl set_param fail_loc=0"
      return 0
  }
  run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
@@ -989,7 +1017,7 @@ test_53a() {
      #define OBD_FAIL_MDS_CLOSE_NET 0x115
      do_facet mds "lctl set_param fail_loc=0x80000115"
      kill -USR1 $close_pid
-    cancel_lru_locks MDC # force the close
+    cancel_lru_locks mdc # force the close
      do_facet mds "lctl set_param fail_loc=0"
      mcreate $DIR/${tdir}-2/f || return 1
      
@@ -1019,7 +1047,7 @@ test_53b() {
  
      do_facet mds "lctl set_param fail_loc=0"
      kill -USR1 $close_pid
-    cancel_lru_locks MDC # force the close
+    cancel_lru_locks mdc # force the close
      wait $close_pid || return 1
      # open should still be here
      [ -d /proc/$open_pid ] || return 2
@@ -1047,7 +1075,7 @@ test_53c() {
  
      do_facet mds "lctl set_param fail_loc=0x80000115"
      kill -USR1 $close_pid
-    cancel_lru_locks MDC  # force the close
+    cancel_lru_locks mdc  # force the close
  
      replay_barrier_nodf mds
      fail_nodf mds
@@ -1074,10 +1102,10 @@ test_53d() {
      # define OBD_FAIL_MDS_CLOSE_NET_REP 0X138    
      do_facet mds "lctl set_param fail_loc=0x8000013b"
      kill -USR1 $close_pid
-    cancel_lru_locks MDC  # force the close
+    cancel_lru_locks mdc  # force the close
      do_facet mds "lctl set_param fail_loc=0"
      mcreate $DIR/${tdir}-2/f || return 1
-    
+
      # close should still be here
      [ -d /proc/$close_pid ] || return 2
      replay_barrier_nodf mds
@@ -1101,14 +1129,14 @@ test_53e() {
      mcreate $DIR/${tdir}-2/f &
      open_pid=$!
      sleep 1
-    
+
      do_facet mds "lctl set_param fail_loc=0"
      kill -USR1 $close_pid
-    cancel_lru_locks MDC  # force the close
+    cancel_lru_locks mdc  # force the close
      wait $close_pid || return 1
      # open should still be here
      [ -d /proc/$open_pid ] || return 2
-    
+
      replay_barrier_nodf mds
      fail mds
      wait $open_pid || return 3
@@ -1132,7 +1160,7 @@ test_53f() {
  
          do_facet mds "lctl set_param fail_loc=0x8000013b"
          kill -USR1 $close_pid
-        cancel_lru_locks MDC
+        cancel_lru_locks mdc
  
          replay_barrier_nodf mds
          fail_nodf mds
@@ -1161,7 +1189,7 @@ test_53g() {
  
          do_facet mds "lctl set_param fail_loc=0x80000115"
          kill -USR1 $close_pid
-        cancel_lru_locks MDC # force the close
+        cancel_lru_locks mdc # force the close
  
          do_facet mds "lctl set_param fail_loc=0"
          replay_barrier_nodf mds
@@ -1187,10 +1215,10 @@ test_53h() {
      mcreate $DIR/${tdir}-2/f &
      open_pid=$!
      sleep 1
-    
+
      do_facet mds "lctl set_param fail_loc=0x8000013b"
      kill -USR1 $close_pid
-    cancel_lru_locks MDC  # force the close
+    cancel_lru_locks mdc  # force the close
      sleep 1
  
      replay_barrier_nodf mds
@@ -1245,7 +1273,7 @@ test_57() {
  run_test 57 "test recovery from llog for setattr op"
  
  #recovery many mds-ost setattr from llog
-test_58() {
+test_58a() {
      mkdir -p $DIR/$tdir
  #define OBD_FAIL_MDS_OST_SETATTR       0x12c
      do_facet mds "lctl set_param fail_loc=0x8000012c"
@@ -1258,11 +1286,46 @@ test_58() {
      unlinkmany $DIR/$tdir/$tfile-%d 2500
      rmdir $DIR/$tdir
  }
-run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)"
+run_test 58a "test recovery from llog for setattr op (test llog_gen_rec)"
+
+test_58b() {
+    mount_client $MOUNT2
+    mkdir -p $DIR/$tdir
+    touch $DIR/$tdir/$tfile
+    replay_barrier mds
+    setfattr -n trusted.foo -v bar $DIR/$tdir/$tfile
+    fail mds
+    VAL=`getfattr --absolute-names --only-value -n trusted.foo $MOUNT2/$tdir/$tfile`
+    [ x$VAL = x"bar" ] || return 1
+    rm -f $DIR/$tdir/$tfile
+    rmdir $DIR/$tdir
+    zconf_umount `hostname` $MOUNT2
+}
+run_test 58b "test replay of setxattr op"
+
+test_58c() { # bug 16570
+        mount_client $MOUNT2
+        mkdir -p $DIR/$tdir
+        touch $DIR/$tdir/$tfile
+        drop_request "setfattr -n trusted.foo -v bar $DIR/$tdir/$tfile" || \
+                return 1
+        VAL=`getfattr --absolute-names --only-value -n trusted.foo $MOUNT2/$tdir/$tfile`
+        [ x$VAL = x"bar" ] || return 2
+        drop_reint_reply "setfattr -n trusted.foo1 -v bar1 $DIR/$tdir/$tfile" || \
+                return 3
+        VAL=`getfattr --absolute-names --only-value -n trusted.foo1 $MOUNT2/$tdir/$tfile`
+        [ x$VAL = x"bar1" ] || return 4
+        rm -f $DIR/$tdir/$tfile
+        rmdir $DIR/$tdir
+               zconf_umount `hostname` $MOUNT2
+}
+run_test 58c "resend/reconstruct setxattr op"
  
  # log_commit_thread vs filter_destroy race used to lead to import use after free
  # bug 11658
  test_59() {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      mkdir -p $DIR/$tdir
      createmany -o $DIR/$tdir/$tfile-%d 200
      sync
@@ -1293,6 +1356,8 @@ run_test 60 "test llog post recovery init vs llog unlink"
  
  #test race  llog recovery thread vs llog cleanup
  test_61a() {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      mkdir -p $DIR/$tdir
      createmany -o $DIR/$tdir/$tfile-%d 800
      replay_barrier ost1 
@@ -1322,6 +1387,8 @@ run_test 61b "test race mds llog sync vs llog cleanup"
  
  #test race  cancel cookie cb vs llog cleanup
  test_61c() {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
  #   OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 
      touch $DIR/$tfile 
      do_facet ost "lctl set_param fail_loc=0x80000222"
@@ -1331,6 +1398,16 @@ test_61c() {
  }
  run_test 61c "test race mds llog sync vs llog cleanup"
  
+test_61d() { # bug 16002
+#define OBD_FAIL_OBD_LLOG_SETUP        0x605
+    stop mds
+    do_facet mds "lctl set_param fail_loc=0x80000605"
+    start mds $MDSDEV $MDS_MOUNT_OPTS && error "mds start should have failed"
+    do_facet mds "lctl set_param fail_loc=0"
+    start mds $MDSDEV $MDS_MOUNT_OPTS || error "cannot restart mds"
+}
+run_test 61d "error in llog_setup should cleanup the llog context correctly"
+
  test_62() { # Bug 15756 - don't mis-drop resent replay
      mkdir -p $DIR/$tdir
      replay_barrier mds
@@ -1347,19 +1424,29 @@ run_test 62 "don't mis-drop resent replay"
  
  #Adaptive Timeouts (bug 3055)
  AT_MAX_SET=0
+# Suppose that all osts have the same at_max
+for facet in mds client ost; do
+    eval AT_MAX_SAVE_${facet}=$(at_max_get $facet)
+done
  
  at_start()
  {
+    local at_max_new=600
      if ! at_is_valid; then
          skip "AT env is invalid"
          return 1
      fi
  
-    if ! at_is_enabled; then
-        echo "AT is disabled, enable it by force temporarily"
-        at_max_set 600 mds ost client
-        AT_MAX_SET=1
-    fi
+    local at_max
+
+    for facet in mds client ost; do
+        at_max=$(at_max_get $facet)
+        if [ $at_max -ne $at_max_new ]; then
+            echo "AT value on $facet is $at_max, set it by force temporarily to $at_max_new"
+            at_max_set $at_max_new $facet
+            AT_MAX_SET=1
+        fi
+    done
  
      if [ -z "$ATOLDBASE" ]; then
         local at_history=$(do_facet mds "find /sys/ -name at_history")
@@ -1368,24 +1455,35 @@ at_start()
          # speed up the timebase so we can check decreasing AT
         do_facet mds "echo 8 >> $at_history"
         do_facet ost1 "echo 8 >> $at_history"
+
+       # sleep for a while to cool down, should be > 8s and also allow
+       # at least one ping to be sent. simply use TIMEOUT to be safe.
+       sleep $TIMEOUT
      fi
  }
  
  test_65a() #bug 3055
  {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      at_start || return 0
      $LCTL dk > /dev/null
      debugsave
      lctl set_param debug="+other"
-    # slow down a request
-    do_facet mds lctl set_param fail_val=30000
+    # Slow down a request to the current service time, this is critical
+    # because previous tests may have caused this value to increase.
+    REQ_DELAY=`lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts |
+               awk '/portal 12/ {print $5}'`
+    REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
+
+    do_facet mds lctl set_param fail_val=$((${REQ_DELAY} * 1000))
  #define OBD_FAIL_PTLRPC_PAUSE_REQ        0x50a
      do_facet mds lctl set_param fail_loc=0x8000050a
      createmany -o $DIR/$tfile 10 > /dev/null
      unlinkmany $DIR/$tfile 10 > /dev/null
      # check for log message
      $LCTL dk | grep "Early reply #" || error "No early reply" 
-    # client should show 30s estimates
+    # client should show REQ_DELAY estimates
      lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal
      sleep 9
      lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal
@@ -1394,13 +1492,20 @@ run_test 65a "AT: verify early replies"
  
  test_65b() #bug 3055
  {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      at_start || return 0
      # turn on D_ADAPTTO
      debugsave
      lctl set_param debug="+other"
      $LCTL dk > /dev/null
-    # slow down bulk i/o
-    do_facet ost1 lctl set_param fail_val=30
+    # Slow down a request to the current service time, this is critical
+    # because previous tests may have caused this value to increase.
+    REQ_DELAY=`lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts |
+               awk '/portal 6/ {print $5}'`
+    REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
+
+    do_facet ost1 lctl set_param fail_val=${REQ_DELAY}
  #define OBD_FAIL_OST_BRW_PAUSE_PACK      0x224
      do_facet ost1 lctl set_param fail_loc=0x224
  
@@ -1413,13 +1518,15 @@ test_65b() #bug 3055
      # check for log message
      $LCTL dk | grep "Early reply #" || error "No early reply"
      debugrestore
-    # client should show 30s estimates
+    # client should show REQ_DELAY estimates
      lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts | grep portal
  }
  run_test 65b "AT: verify early replies on packed reply / bulk"
  
  test_66a() #bug 3055
  {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      at_start || return 0
      lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
      # adjust 5s at a time so no early reply is sent (within deadline)
@@ -1448,6 +1555,8 @@ run_test 66a "AT: verify MDT service time adjusts with no early replies"
  
  test_66b() #bug 3055
  {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      at_start || return 0
      ORIG=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $4}')
      lctl set_param fail_val=$(($ORIG + 5))
@@ -1464,6 +1573,8 @@ run_test 66b "AT: verify net latency adjusts"
  
  test_67a() #bug 3055
  {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      at_start || return 0
      CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
      # sleeping threads may drive values above this
@@ -1483,6 +1594,8 @@ run_test 67a "AT: verify slow request processing doesn't induce reconnects"
  
  test_67b() #bug 3055
  {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      at_start || return 0
      CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
  #define OBD_FAIL_OST_PAUSE_CREATE        0x223
@@ -1490,7 +1603,7 @@ test_67b() #bug 3055
      do_facet ost1 "lctl set_param fail_loc=0x80000223"
      cp /etc/profile $DIR/$tfile || error "cp failed"
      client_reconnect
-    lctl get_param -n ost.OSS.ost_create.timeouts
+    do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
      log "phase 2"
      CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
      ATTEMPTS=$(($CONN2 - $CONN1))
@@ -1500,8 +1613,8 @@ test_67b() #bug 3055
      cp /etc/profile $DIR/$tfile || error "cp failed"
      do_facet ost1 "lctl set_param fail_loc=0"
      client_reconnect
-    lctl get_param -n ost.OSS.ost_create.timeouts
-    CONN3=$(`lctl get_param -n osc.*.stats` | awk '/_connect/ {total+=$2} END {print total}')
+    do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
+    CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
      ATTEMPTS=$(($CONN3 - $CONN2))
      echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
      [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
@@ -1511,6 +1624,8 @@ run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
  
  test_68 () #bug 13813
  {
+    remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
      at_start || return 0
      local ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min)
      [ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0
@@ -1538,8 +1653,15 @@ if [ -n "$ATOLDBASE" ]; then
  fi
  
  if [ $AT_MAX_SET -ne 0 ]; then
-    echo "restore AT status to be disabled"
-    at_max_set 0 mds ost client
+    for facet in mds client ost; do
+        var=AT_MAX_SAVE_${facet}
+        echo restore AT on $facet to saved value ${!var}
+        at_max_set ${!var} $facet
+        AT_NEW=$(at_max_get $facet)
+        echo Restored AT value on $facet $AT_NEW 
+        [ $AT_NEW -ne ${!var} ] && \
+            error "$facet : AT value was not restored SAVED ${!var} NEW $AT_NEW"
+    done
  fi
  
  # end of AT tests includes above lines
@@ -1562,7 +1684,7 @@ test_70a () {
                                 error "dd failed on $CLIENT"
         done
  
-       local prev_client=$(echo $clients | sed 's/^.* \(\w\+\)$/\1/') 
+       local prev_client=$(echo $clients | sed 's/^.* \(.\+\)$/\1/') 
         for C in ${CLIENTS//,/ }; do
                 do_node $prev_client dd if=$DIR/${tfile}_${C} of=/dev/null 2>/dev/null || \
                         error "dd if=$DIR/${tfile}_${C} failed on $prev_client"
@@ -1570,8 +1692,6 @@ test_70a () {
         done
         
         ls $DIR
-
-       zconf_umount_clients $CLIENTS $DIR
  }
  run_test 70a "check multi client t-f"
  
@@ -1583,37 +1703,187 @@ test_70b () {
  
         zconf_mount_clients $CLIENTS $DIR
         
-       local duration="-t 60"
-       local cmd="rundbench 1 $duration "
+       local duration=120
+       [ "$SLOW" = "no" ] && duration=60
+       local cmd="rundbench 1 -t $duration"
         local PID=""
-       for CLIENT in ${CLIENTS//,/ }; do
-               $PDSH $CLIENT "set -x; PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:${DBENCH_LIB} DBENCH_LIB=${DBENCH_LIB} $cmd" &
-               PID=$!
-               echo $PID >pid.$CLIENT
-               echo "Started load PID=`cat pid.$CLIENT`"
-       done
+       do_nodes $CLIENTS "set -x; PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB DBENCH_LIB=$DBENCH_LIB $cmd" &
+       PID=$!
+       log "Started rundbench load PID=$PID ..."
  
+       sleep $((duration / 4))
         replay_barrier mds 
         sleep 3 # give clients a time to do operations
  
         log "$TESTNAME fail mds 1"
         fail mds
  
-# wait for client to reconnect to MDS
-       sleep $TIMEOUT
-
-       for CLIENT in ${CLIENTS//,/ }; do
-               PID=`cat pid.$CLIENT`
-               wait $PID
-               rc=$?
-               echo "load on ${CLIENT} returned $rc"
-       done
+       wait $PID || error "rundbench load on $CLIENTS failed!"
  
-       zconf_umount_clients $CLIENTS $DIR 
  }
  run_test 70b "mds recovery; $CLIENTCOUNT clients"
  # end multi-client tests
  
+# vbr export handling
+test_71a() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    UUID=$(lctl dl | awk '/mdc.*-mdc-/ { print $5 }')
+    echo "Client UUID is $UUID"
+    replay_barrier mds
+    umount $DIR
+    facet_failover mds
+    zconf_mount `hostname` $DIR || error "mount fails"
+    df $DIR || error "post-failover df failed"
+    do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep $UUID" || \
+        error "no delayed exports"
+    OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
+    NEW_AGE=10
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
+    sleep $((NEW_AGE + 2))
+    do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep \"$UUID.*EXPIRED\"" || \
+        error "exports didn't expire"
+    do_facet mds "lctl set_param mds.${mds_svc}.evict_client=$UUID"
+    do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep $UUID" && \
+        error "Export wasn't removed manually"
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
+    return 0;
+}
+run_test 71a "lost client export is kept"
+
+test_71b() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    FAKE_NUM=10
+    create_fake_exports mds $FAKE_NUM
+    NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+    [ $NUM -eq 0 ] && error "no fake exports $NUM - $FAKE_NUM"
+    OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
+    NEW_AGE=10
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
+    sleep $((NEW_AGE + 2))
+    EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
+    [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
+    do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
+    do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep EXPIRED" && \
+        error "Exports weren't flushed"
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
+    return 0;
+}
+run_test 71b "stale exports are expired, lctl flushes them"
+
+test_71c() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    FAKE_NUM=10
+    create_fake_exports mds $FAKE_NUM
+    NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+    [ "$NUM" -eq "$FAKE_NUM" ] || error "no fake exports $NUM - $FAKE_NUM"
+    OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
+    NEW_AGE=10
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
+    sleep $((NEW_AGE + 2))
+    EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
+    [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
+
+    umount $DIR
+    zconf_mount `hostname` $DIR || error "mount fails"
+
+    NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+    [ $NUM -eq 0 ] || error "$NUM fake exports are still exists"
+    do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
+    return 0;
+}
+run_test 71c "stale exports are expired, new client connection flush them"
+
+test_71d() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    FAKE_NUM=10
+    create_fake_exports mds $FAKE_NUM
+    NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+    [ "$NUM" -eq "$FAKE_NUM" ] || error "no fake exports $NUM - $FAKE_NUM"
+    OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
+    NEW_AGE=10
+    do_facet mds "lctl conf_param ${mds_svc}.mdt.stale_export_age=$NEW_AGE"
+    sleep $((NEW_AGE + 2))
+    EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
+    [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
+
+    fail mds
+
+    FAIL_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
+    [ $FAIL_AGE -eq $NEW_AGE ] || error "new age wasn't set after recovery"
+    NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
+    [ $NUM -eq 0 ] || error "$NUM fake exports are still exists"
+    do_facet mds "lctl conf_param ${mds_svc}.mdt.stale_export_age=$OLD_AGE"
+    return 0;
+}
+run_test 71d "expired exports, server init removes them, conf_param works"
+
+# end vbr exports tests
+
+test_72() { #bug 16711
+    replay_barrier mds
+    multiop_bg_pause $DIR/$tfile O_c || return 4
+    pid=$!
+#define OBD_FAIL_TGT_REPLAY_DELAY 0x709
+    do_facet mds "lctl set_param fail_loc=0x80000709"
+    fail mds
+    kill -USR1 $pid || return 1
+    wait $pid || return 2
+    $CHECKSTAT -t file $DIR/$tfile || return 3
+}
+run_test 72 "target_finish_recovery vs process_recovery_queue race"
+
+test_73a() {
+    multiop_bg_pause $DIR/$tfile O_tSc || return 3
+    pid=$!
+    rm -f $DIR/$tfile
+
+    replay_barrier mds
+#define OBD_FAIL_LDLM_ENQUEUE       0x302
+    do_facet mds "lctl set_param fail_loc=0x80000302"
+    fail mds
+    kill -USR1 $pid
+    wait $pid || return 1
+    [ -e $DIR/$tfile ] && return 2
+    return 0
+}
+run_test 73a "open(O_CREAT), unlink, replay, reconnect before open replay , close"
+
+test_73b() {
+    multiop_bg_pause $DIR/$tfile O_tSc || return 3
+    pid=$!
+    rm -f $DIR/$tfile
+
+    replay_barrier mds
+#define OBD_FAIL_LDLM_REPLY       0x30c
+    do_facet mds "lctl set_param fail_loc=0x8000030c"
+    fail mds
+    kill -USR1 $pid
+    wait $pid || return 1
+    [ -e $DIR/$tfile ] && return 2
+    return 0
+}
+run_test 73b "open(O_CREAT), unlink, replay, reconnect at open_replay reply, close"
+
+test_73c() {
+    multiop_bg_pause $DIR/$tfile O_tSc || return 3
+    pid=$!
+    rm -f $DIR/$tfile
+
+    replay_barrier mds
+#define OBD_FAIL_TGT_LAST_REPLAY       0x710
+    do_facet mds "lctl set_param fail_loc=0x80000710"
+    fail mds
+    kill -USR1 $pid
+    wait $pid || return 1
+    [ -e $DIR/$tfile ] && return 2
+    return 0
+}
+run_test 73c "open(O_CREAT), unlink, replay, reconnect at last_replay, close"
+
  equals_msg `basename $0`: test complete, cleaning up
  check_and_cleanup_lustre
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh

new file mode 100644 (file)

index 0000000..f4ffeb3
--- /dev/null
+++ b/lustre/tests/replay-vbr.sh
@@ -0,0 +1,724 @@
+#!/bin/bash
+
+set -e
+
+# bug number:
+ALWAYS_EXCEPT="3c 4b 4c 10 $REPLAY_VBR_EXCEPT"
+
+SAVE_PWD=$PWD
+PTLDEBUG=${PTLDEBUG:--1}
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+[ -n "$CLIENTS" ] || { skip "Need two or more clients" && exit 0; }
+[ $CLIENTCOUNT -ge 2 ] || \
+    { skip "Need two or more clients, have $CLIENTCOUNT" && exit 0; }
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+
+[ "$SLOW" = "no" ] && EXCEPT_SLOW=""
+
+
+[ ! "$NAME" = "ncli" ] && ALWAYS_EXCEPT="$ALWAYS_EXCEPT"
+[ "$NAME" = "ncli" ] && MOUNT_2=""
+MOUNT_2=""
+build_test_filter
+
+check_and_setup_lustre
+rm -rf $DIR/[df][0-9]*
+
+[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
+
+[ "$CLIENTS" ] && zconf_umount_clients $CLIENTS $DIR
+
+test_1() {
+    echo "mount client $CLIENT1,$CLIENT2..."
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    do_node $CLIENT2 mkdir -p $DIR/$tdir
+    replay_barrier mds
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -o $DIR/$tdir/$tfile-2- 1
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25
+    zconf_umount $CLIENT2 $DIR
+
+    facet_failover mds
+    # recovery shouldn't fail due to missing client 2
+    do_node $CLIENT1 df $DIR || return 1
+
+    # All 50 files should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 3
+
+    zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail"
+    [ -e $DIR/$tdir/$tfile-2-0 ] && error "$tfile-2-0 exists"
+
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 1 "VBR: client during replay doesn't affect another one"
+
+test_2() {
+    #ls -al $DIR/$tdir/$tfile
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    do_node $CLIENT2 mkdir -p $DIR/$tdir
+    replay_barrier mds
+    do_node $CLIENT2 mcreate $DIR/$tdir/$tfile
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    #do_node $CLIENT2 createmany -o $DIR/$tdir/$tfile-2- 1
+    do_node $CLIENT1 $CHECKSTAT $DIR/$tdir/$tfile
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25
+    zconf_umount $CLIENT2 $DIR
+
+    facet_failover mds
+    # recovery shouldn't fail due to missing client 2
+    do_node $CLIENT1 df $DIR || return 1
+
+    # All 50 files should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 3
+
+    do_node $CLIENT1 $CHECKSTAT $DIR/$tdir/$tfile && return 4
+
+    zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail"
+
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 2 "VBR: lost data due to missed REMOTE client during replay"
+
+test_3a() {
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    #make sure the time will change
+    do_facet mds "$LCTL set_param mds.${mds_svc}.atime_diff=0" || return
+    do_node $CLIENT1 touch $DIR/$tfile
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile
+    sleep 1
+    replay_barrier mds
+    #change time
+    do_node $CLIENT2 touch $DIR/$tfile
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile
+    #another change
+    do_node $CLIENT1 touch $DIR/$tfile
+    #remove file
+    do_node $CLIENT2 rm $DIR/$tfile
+    zconf_umount $CLIENT2 $DIR
+
+    facet_failover mds
+    # recovery shouldn't fail due to missing client 2
+    do_node $CLIENT1 df $DIR || return 1
+    do_node $CLIENT1 $CHECKSTAT $DIR/$tfile && return 2
+
+    zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail"
+
+    zconf_umount_clients $CLIENTS $DIR
+
+    return 0
+}
+run_test 3a "VBR: setattr of time/size doesn't change version"
+
+test_3b() {
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    #make sure the time will change
+    do_facet mds "$LCTL set_param mds.${mds_svc}.atime_diff=0" || return
+    do_facet mds "$LCTL set_param mds.${mds_svc}.sync_permission=0" || return
+    do_node $CLIENT1 touch $DIR/$tfile
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile
+    sleep 1
+    replay_barrier mds
+    #change mode
+    do_node $CLIENT2 chmod +x $DIR/$tfile
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile
+    #abother chmod
+    do_node $CLIENT1 chmod -x $DIR/$tfile
+    zconf_umount $CLIENT2 $DIR
+
+    facet_failover mds
+    # recovery should fail due to missing client 2
+    do_node $CLIENT1 df $DIR && return 1
+
+    do_node $CLIENT1 $CHECKSTAT -p 755 $DIR/$tfile && return 2
+    zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail"
+
+    zconf_umount_clients $CLIENTS $DIR
+
+    return 0
+}
+run_test 3b "VBR: setattr of permissions changes version"
+
+test_3c() {
+    [ "$FAILURE_MODE" = HARD ] || \
+        { skip "The HARD failure is needed" && return 0; }
+
+    [ $RUNAS_ID -eq $UID ] && skip "RUNAS_ID = UID = $UID -- skipping" && return
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    # check that permission changes are synced
+    do_facet mds "$LCTL set_param mds.${mds_svc}.sync_permission=1"
+
+    do_node $CLIENT1 mkdir -p $DIR/d3c/sub || error
+    #chown -R $RUNAS_ID $MOUNT1/d3
+    do_node $CLIENT1 ls -la $DIR/d3c
+
+    # only HARD failure will work as we use sync operation
+    replay_barrier mds
+    do_node $CLIENT2 mcreate $DIR/d3c/$tfile-2
+    #set permissions
+    do_node $CLIENT1 chmod 0700 $UID $DIR/d3c
+    #secret file
+    do_node $CLIENT1 mcreate $DIR/d3c/sub/$tfile
+    do_node $CLIENT1 echo "Top Secret" > $DIR/d3c/sub/$tfile
+    #check user can't access new file
+    do_node $CLIENT2 $RUNAS ls $DIR/d3c && return 3
+    do_node $CLIENT1 $RUNAS ls $DIR/d3c && return 4
+    do_node $CLIENT1 $RUNAS cat $DIR/d3c/sub/$tfile && return 5
+
+    zconf_umount $CLIENT2 $DIR
+
+    facet_failover mds
+    # recovery shouldn't fail due to missing client 2
+    do_node $CLIENT1 df $DIR || return 1
+    sleep 1
+
+    zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail"
+    do_node $CLIENT1 $RUNAS cat $DIR/d3c/sub/$tfile && return 6
+    do_node $CLIENT2 $RUNAS cat $DIR/d3c/sub/$tfile && return 7
+    do_facet mds "$LCTL set_param mds.${mds_svc}.sync_permission=0"
+
+    return 0
+}
+run_test 3c "VBR: permission dependency failure"
+
+vbr_deactivate_client() {
+    local client=$1
+    echo "Deactivating client $client";
+    do_node $client "sysctl -w lustre.fail_loc=0x50d"
+}
+
+vbr_activate_client() {
+    local client=$1
+    echo "Activating client $client";
+    do_node $client "sysctl -w lustre.fail_loc=0x0"
+}
+
+remote_server ()
+{
+    local client=$1
+    [ -z "$(do_node $client lctl dl | grep mdt)" ] && \
+    [ -z "$(do_node $client lctl dl | grep ost)" ]
+}
+
+test_4a() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    do_node $CLIENT2 mkdir -p $DIR/$tdir
+    replay_barrier mds
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -o $DIR/$tdir/$tfile-2- 25
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25
+    vbr_deactivate_client $CLIENT2
+
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 1
+
+    # All 50 files should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 3
+
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 4
+    # All 25 files from client2 should have been replayed
+    do_node $CLIENT2 unlinkmany $DIR/$tdir/$tfile-2- 25 || return 5
+
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 4a "fail MDS, delayed recovery"
+
+test_4b() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    replay_barrier mds
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -o $DIR/$tdir/$tfile-2- 25
+    vbr_deactivate_client $CLIENT2
+
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 1
+
+    # create another set of files
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25
+
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 2
+
+    # All files from should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 3
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 4
+    do_node $CLIENT2 unlinkmany $DIR/$tdir/$tfile-2- 25 || return 5
+
+    zconf_umount_clients $CLIENTS $DIR
+}
+run_test 4b "fail MDS, normal operation, delayed open recovery"
+
+test_4c() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    replay_barrier mds
+    do_node $CLIENT1 createmany -m $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -m $DIR/$tdir/$tfile-2- 25
+    vbr_deactivate_client $CLIENT2
+
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 1
+
+    # create another set of files
+    do_node $CLIENT1 createmany -m $DIR/$tfile-3- 25
+
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 2
+
+    # All files from should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 3
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 4
+    do_node $CLIENT2 unlinkmany $DIR/$tdir/$tfile-2- 25 || return 5
+
+    zconf_umount_clients $CLIENTS $DIR
+}
+run_test 4c "fail MDS, normal operation, delayed recovery"
+
+test_5a() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    replay_barrier mds
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -o $DIR/$tfile-2- 1
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 1
+    vbr_deactivate_client $CLIENT2
+
+    facet_failover mds
+    do_node $CLIENT1 df $DIR && return 1
+
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 2
+
+    # First 25 files should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 3
+    # Third file is failed due to missed client2
+    do_node $CLIENT1 $CHECKSTAT $DIR/$tfile-3-0 && error "$tfile-3-0 exists"
+    # file from client2 should exists
+    do_node $CLIENT2 unlinkmany $DIR/$tfile-2- 1 || return 4
+
+    zconf_umount_clients $CLIENTS $DIR
+}
+run_test 5a "fail MDS, delayed recovery should fail"
+
+test_5b() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    replay_barrier mds
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -o $DIR/$tfile-2- 1
+    vbr_deactivate_client $CLIENT2
+
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 1
+    do_node $CLIENT1 $CHECKSTAT $DIR/$tfile-2-0 && error "$tfile-2-0 exists"
+
+    # create another set of files
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25
+
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR && return 4
+    # file from client2 should fail
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile-2-0 && error "$tfile-2-0 exists"
+
+    # All 50 files from client 1 should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 3
+
+    zconf_umount_clients $CLIENTS $DIR
+}
+run_test 5b "fail MDS, normal operation, delayed recovery should fail"
+
+test_6a() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    do_node $CLIENT2 mkdir -p $DIR/$tdir
+    replay_barrier mds
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -o $DIR/$tdir/$tfile-2- 25
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25
+    vbr_deactivate_client $CLIENT2
+
+    facet_failover mds
+    # replay only 5 requests
+    do_node $CLIENT2 "sysctl -w lustre.fail_val=5"
+#define OBD_FAIL_PTLRPC_REPLAY        0x50e
+    do_node $CLIENT2 "sysctl -w lustre.fail_loc=0x2000050e"
+    do_node $CLIENT2 df $DIR
+    # vbr_activate_client $CLIENT2
+    # need way to know that client stops replays
+    sleep 5
+
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 1
+
+    # All files should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 3
+    do_node $CLIENT2 unlinkmany $DIR/$tdir/$tfile-2- 25 || return 5
+
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 6a "fail MDS, delayed recovery, fail MDS"
+
+test_7a() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    do_node $CLIENT2 mkdir -p $DIR/$tdir
+    replay_barrier mds
+    do_node $CLIENT1 createmany -o $DIR/$tfile- 25
+    do_node $CLIENT2 createmany -o $DIR/$tdir/$tfile-2- 25
+    do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25
+    vbr_deactivate_client $CLIENT2
+
+    facet_failover mds
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 4
+
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 1
+
+    # All files should have been replayed
+    do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2
+    do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 3
+    do_node $CLIENT2 unlinkmany $DIR/$tdir/$tfile-2- 25 || return 5
+
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 7a "fail MDS, delayed recovery, fail MDS"
+
+rmultiop_start() {
+    local client=$1
+    local file=$2
+
+    # We need to run do_node in bg, because pdsh does not exit
+    # if child process of run script exists.
+    # I.e. pdsh does not exit when runmultiop_bg_pause exited,
+    # because of multiop_bg_pause -> $MULTIOP_PROG &
+    # By the same reason we need sleep a bit after do_nodes starts 
+    # to let runmultiop_bg_pause start muliop and
+    # update /tmp/multiop_bg.pid ;
+    # The rm /tmp/multiop_bg.pid guarantees here that 
+    # we have the updated by runmultiop_bg_pause
+    # /tmp/multiop_bg.pid file
+
+    local pid_file=$TMP/multiop_bg.pid.$$
+    do_node $client "rm -f $pid_file && MULTIOP_PID_FILE=$pid_file LUSTRE= runmultiop_bg_pause $file O_tSc" & 
+    local pid=$!
+    sleep 3
+    local multiop_pid
+    multiop_pid=$(do_node $client cat $pid_file)
+    [ -n "$multiop_pid" ] || error "$client : Can not get multiop_pid from $pid_file "
+    eval export ${client}_multiop_pid=$multiop_pid
+    eval export ${client}_do_node_pid=$pid
+    local var=${client}_multiop_pid
+    echo client $client multiop_bg started multiop_pid=${!var}
+    return $?
+}
+
+rmultiop_stop() {
+    local client=$1
+    local multiop_pid=${client}_multiop_pid
+    local do_node_pid=${client}_do_node_pid
+
+    echo "Stopping multiop_pid=${!multiop_pid} (kill ${!multiop_pid} on $client)"
+    do_node $client kill -USR1 ${!multiop_pid}
+
+    wait ${!do_node_pid} || true
+}
+
+test_8a() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    rmultiop_start $CLIENT2 $DIR/$tfile || return 1
+    do_node $CLIENT2 rm -f $DIR/$tfile
+    replay_barrier mds
+    rmultiop_stop $CLIENT2 || return 2
+
+    vbr_deactivate_client $CLIENT2
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 3
+    #client1 is back and will try to open orphan
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 4
+
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists"
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 8a "orphans are kept until delayed recovery"
+
+test_8b() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    rmultiop_start $CLIENT2 $DIR/$tfile || return 1
+    replay_barrier mds
+    do_node $CLIENT1 rm -f $DIR/$tfile
+
+    vbr_deactivate_client $CLIENT2
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 2
+    #client1 is back and will try to open orphan
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 3
+
+    rmultiop_stop $CLIENT2 || return 1
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists"
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 8b "open1 | unlink2 X delayed_replay1, close1"
+
+test_8c() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    rmultiop_start $CLIENT2 $DIR/$tfile || return 1
+    replay_barrier mds
+    do_node $CLIENT1 rm -f $DIR/$tfile
+    rmultiop_stop $CLIENT2 || return 2
+
+    vbr_deactivate_client $CLIENT2
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 3
+    #client1 is back and will try to open orphan
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 4
+
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists"
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 8c "open1 | unlink2, close1 X delayed_replay1"
+
+test_8d() {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    remote_server $CLIENT2 || \
+        { skip "Client $CLIENT2 is on the server node" && return 0; }
+
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    rmultiop_start $CLIENT1 $DIR/$tfile || return 1
+    rmultiop_start $CLIENT2 $DIR/$tfile || return 2
+    replay_barrier mds
+    do_node $CLIENT1 rm -f $DIR/$tfile
+    rmultiop_stop $CLIENT2 || return 3
+    rmultiop_stop $CLIENT1 || return 4
+
+    vbr_deactivate_client $CLIENT2
+    facet_failover mds
+    do_node $CLIENT1 df $DIR || return 6
+
+    #client1 is back and will try to open orphan
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 8
+
+    do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists"
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 8d "open1, open2 | unlink2, close1, close2 X delayed_replay1"
+
+test_8e() {
+    zconf_mount $CLIENT1 $DIR
+    zconf_mount $CLIENT2 $DIR
+
+    do_node $CLIENT1 mcreate $DIR/$tfile
+    do_node $CLIENT1 mkdir $DIR/$tfile-2
+    replay_barrier mds
+    # missed replay from client1 will lead to recovery by versions
+    do_node $CLIENT1 touch $DIR/$tfile-2/$tfile
+    do_node $CLIENT2 rm $DIR/$tfile || return 1
+    do_node $CLIENT2 touch $DIR/$tfile || return 2
+
+    zconf_umount $CLIENT1 $DIR
+    facet_failover mds
+    do_node $CLIENT2 df $DIR || return 6
+
+    do_node $CLIENT2 rm $DIR/$tfile || error "$tfile doesn't exists"
+    zconf_umount_clients $CLIENTS $DIR
+    return 0
+}
+run_test 8e "create | unlink, create shouldn't fail"
+
+test_8f() {
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    do_node $CLIENT1 touch $DIR/$tfile
+    do_node $CLIENT1 mkdir $DIR/$tfile-2
+    replay_barrier mds
+    # missed replay from client1 will lead to recovery by versions
+    do_node $CLIENT1 touch $DIR/$tfile-2/$tfile
+    do_node $CLIENT2 rm -f $DIR/$tfile || return 1
+    do_node $CLIENT2 mcreate $DIR/$tfile || return 2
+
+    zconf_umount $CLIENT1 $DIR
+    facet_failover mds
+    do_node $CLIENT2 df $DIR || return 6
+
+    do_node $CLIENT2 rm $DIR/$tfile || error "$tfile doesn't exists"
+    zconf_umount $CLIENT2 $DIR
+    return 0
+}
+run_test 8f "create | unlink, create shouldn't fail"
+
+test_8g() {
+    zconf_mount_clients $CLIENT1 $DIR
+    zconf_mount_clients $CLIENT2 $DIR
+
+    do_node $CLIENT1 touch $DIR/$tfile
+    do_node $CLIENT1 mkdir $DIR/$tfile-2
+    replay_barrier mds
+    # missed replay from client1 will lead to recovery by versions
+    do_node $CLIENT1 touch $DIR/$tfile-2/$tfile
+    do_node $CLIENT2 rm -f $DIR/$tfile || return 1
+    do_node $CLIENT2 mkdir $DIR/$tfile || return 2
+
+    zconf_umount $CLIENT1 $DIR
+    facet_failover mds
+    do_node $CLIENT2 df $DIR || return 6
+
+    do_node $CLIENT2 rmdir $DIR/$tfile || error "$tfile doesn't exists"
+    zconf_umount $CLIENT2 $DIR
+    return 0
+}
+run_test 8g "create | unlink, create shouldn't fail"
+
+test_10 () {
+    delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
+
+    [ -z "$DBENCH_LIB" ] && skip "DBENCH_LIB is not set" && return 0
+
+    zconf_mount_clients $CLIENTS $DIR
+
+    local duration="-t 60"
+    local cmd="rundbench 1 $duration "
+    local PID=""
+    for CLIENT in ${CLIENTS//,/ }; do
+        $PDSH $CLIENT "set -x; PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:${DBENCH_LIB} DBENCH_LIB=${DBENCH_LIB} $cmd" &
+        PID=$!
+        echo $PID >pid.$CLIENT
+        echo "Started load PID=`cat pid.$CLIENT`"
+    done
+
+    replay_barrier mds
+    sleep 3 # give clients a time to do operations
+
+    vbr_deactivate_client $CLIENT2
+
+    log "$TESTNAME fail mds 1"
+    fail mds
+
+# wait for client to reconnect to MDS
+    sleep $TIMEOUT
+
+    vbr_activate_client $CLIENT2
+    do_node $CLIENT2 df $DIR || return 4
+
+    for CLIENT in ${CLIENTS//,/ }; do
+        PID=`cat pid.$CLIENT`
+        wait $PID
+        rc=$?
+        echo "load on ${CLIENT} returned $rc"
+    done
+
+    zconf_umount_clients $CLIENTS $DIR
+}
+run_test 10 "mds version recovery; $CLIENTCOUNT clients"
+
+equals_msg `basename $0`: test complete, cleaning up
+#SLEEP=$((`date +%s` - $NOW))
+#[ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
+check_and_cleanup_lustre
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/rmdirmany.c b/lustre/tests/rmdirmany.c

index d0c663a..0db1c8f 100755 (executable)
--- a/lustre/tests/rmdirmany.c
+++ b/lustre/tests/rmdirmany.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/run-llog.sh b/lustre/tests/run-llog.sh

index fe885f7..e8d1826 100644 (file)
--- a/lustre/tests/run-llog.sh
+++ b/lustre/tests/run-llog.sh
@@ -1,26 +1,28 @@
  #!/bin/bash
  
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+
  load_llog_test() {
      grep -q llog_test /proc/modules && return
      # Module should have been placed with other lustre modules...
      modprobe llog_test 2>&1 | grep -v "llog_test not found"
      grep -q llog_test /proc/modules && return
      # But maybe we're running from a developer tree...
-    insmod ../obdclass/llog_test.ko
+    insmod $LUSTRE/obdclass/llog_test.ko
      grep -q llog_test /proc/modules && return
      # This is for 2.4 kernels (deprecated!)
-    insmod ../obdclass/llog_test.o
+    insmod $LUSTRE/obdclass/llog_test.o
      grep -q llog_test /proc/modules && return
      echo "Unable to load llog_test module!"
      false
      return
  }
  
-PATH=`dirname $0`:`dirname $0`/../utils:$PATH
+PATH=`dirname $0`:$LUSTRE/utils:$PATH
  TMP=${TMP:-/tmp}
  
-MDS=`ls $LPROC/mds | grep -v num_refs | head -n 1`
-[ -z "$MDS" ] && echo "no MDS available, skipping llog test" && exit 0
+MDS=`lctl dl | grep mds | awk '{print $4}' | head -n 1`
+[ -z "$MDS" ] && echo "$0: SKIP: no MDS available, skipping llog test" && exit 0
  
  load_llog_test || exit 0
  lctl modules > $TMP/ogdb-`hostname`
diff --git a/lustre/tests/run_dbench.sh b/lustre/tests/run_dbench.sh

new file mode 100755 (executable)

index 0000000..f82d9dd
--- /dev/null
+++ b/lustre/tests/run_dbench.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+    echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+    exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+    trap 0
+    echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+    kill  $load_pid 
+    kill -TERM -$PPID
+    sleep 5
+    kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/dbench-$(hostname)
+
+CONTINUE=true
+
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+    echoerr "$(date +'%F %H:%M:%S'): dbench run starting"
+
+    mkdir -p $TESTDIR
+    rundbench -D $TESTDIR 2 1>$LOG &
+    load_pid=$!
+
+    wait $load_pid
+    if [ ${PIPESTATUS[0]} -eq 0 ]; then
+       echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
+       cd $TMP
+       rm -rf $TESTDIR
+       echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
+    else
+       echoerr "$(date +'%F %H:%M:%S'): dbench failed"
+       if [ -z "$ERRORS_OK" ]; then
+           echo $(hostname) >> $END_RUN_FILE
+       fi
+       if [ $BREAK_ON_ERROR ]; then
+           # break
+            CONTINUE=false
+       fi
+    fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): dbench run exiting"
diff --git a/lustre/tests/run_dd.sh b/lustre/tests/run_dd.sh

new file mode 100755 (executable)

index 0000000..96a4950
--- /dev/null
+++ b/lustre/tests/run_dd.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+    echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+    exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+    echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+    kill -TERM -$PPID
+    sleep 5
+    kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/dd-$(hostname)
+
+CONTINUE=true
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+    echoerr "$(date +'%F %H:%M:%S'): dd run starting"
+    mkdir -p $TESTDIR
+    cd $TESTDIR
+    dd bs=4k count=1000000 if=/dev/zero of=$TESTDIR/dd-file 1>$LOG &
+    load_pid=$!
+    wait $load_pid
+
+    if [ $? -eq 0 ]; then
+       echoerr "$(date +'%F %H:%M:%S'): dd succeeded"
+       cd $TMP
+       rm -rf $TESTDIR
+       echoerr "$(date +'%F %H:%M:%S'): dd run finished"
+    else
+       echoerr "$(date +'%F %H:%M:%S'): dd failed"
+       if [ -z "$ERRORS_OK" ]; then
+           echo $(hostname) >> $END_RUN_FILE
+       fi
+       if [ $BREAK_ON_ERROR ]; then
+           # break
+            CONTINUE=false
+       fi
+    fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): dd run exiting"
diff --git a/lustre/tests/run_iozone.sh b/lustre/tests/run_iozone.sh

new file mode 100755 (executable)

index 0000000..2b71118
--- /dev/null
+++ b/lustre/tests/run_iozone.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+    echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+    exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+    echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+    kill -TERM -$PPID
+    sleep 5
+    kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/iozone-$(hostname)
+
+# needed to debug oom problem
+#echo 1 > /proc/sys/vm/vm_gfp_debug
+#killpids=""
+#vmstat 1 1000000 >$TMP/iozone.vmstat.out &
+#killpids="$killpids $!"
+#$LUSTRE_TESTS/runvmstat > $TMP/iozone.runvmstat.out &
+#killpids="$killpids $!"
+
+CONTINUE=true
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+    echoerr "$(date +'%F %H:%M:%S'): iozone run starting"
+    mkdir -p $TESTDIR
+    cd $TESTDIR
+    iozone -a -M -R -V 0xab -g 100M -q 512k -i0 -i1 -f $TESTDIR/iozone-file 1>$LOG &
+    load_pid=$!
+    wait $load_pid
+    if [ ${PIPESTATUS[0]} -eq 0 ]; then
+       echoerr "$(date +'%F %H:%M:%S'): iozone succeeded"
+       cd $TMP
+       rm -rf $TESTDIR
+        if [ -d $TESTDIR ]; then
+           echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR"
+           echo $(hostname) >> $END_RUN_FILE
+            CONTINUE=false
+        fi
+       echoerr "$(date +'%F %H:%M:%S'): iozone run finished"
+    else
+       echoerr "$(date +'%F %H:%M:%S'): iozone failed"
+       if [ -z "$ERRORS_OK" ]; then
+           echo $(hostname) >> $END_RUN_FILE
+       fi
+       if [ $BREAK_ON_ERROR ]; then
+           # break
+            CONTINUE=false
+       fi
+    fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): iozone run exiting"
+#kill $killpids
+#sleep 5
+#kill -9 $killpids
diff --git a/lustre/tests/run_tar.sh b/lustre/tests/run_tar.sh

new file mode 100755 (executable)

index 0000000..7502c24
--- /dev/null
+++ b/lustre/tests/run_tar.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+    echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+    exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+    echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+    kill -TERM -$PPID
+    sleep 5
+    kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/tar-$(hostname)
+
+CONTINUE=true
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+    echoerr "$(date +'%F %H:%M:%S'): tar run starting"
+    mkdir -p $TESTDIR
+    cd $TESTDIR
+    tar cf - /etc | tar xf - 2>&1 | tee $LOG &
+    load_pid=$!
+ps -e f -o "pid ppid pgrp comm" >$TMP/client-load.ps-list
+    wait $load_pid
+    RC=${PIPESTATUS[0]}
+    PREV_ERRORS=$(grep "exit delayed from previous errors" $LOG) || true
+    if [ $RC -ne 0 -a "$ERRORS_OK" -a "$PREV_ERRORS" ]; then
+        echoerr "$(date +'%F %H:%M:%S'): tar errors earlier, ignoring"
+        RC=0
+    fi
+    if [ $RC -eq 0 ]; then
+       echoerr "$(date +'%F %H:%M:%S'): tar succeeded"
+       cd $TMP
+       rm -rf $TESTDIR
+       echoerr "$(date +'%F %H:%M:%S'): tar run finished"
+    else
+       echoerr "$(date +'%F %H:%M:%S'): tar failed"
+       if [ -z "$ERRORS_OK" ]; then
+           echo $(hostname) >> $END_RUN_FILE
+       fi
+       if [ $BREAK_ON_ERROR ]; then
+           # break
+            CONTINUE=false
+       fi
+    fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): tar run exiting"
diff --git a/lustre/tests/runas.c b/lustre/tests/runas.c

index 4db7617..6a1fbab 100644 (file)
--- a/lustre/tests/runas.c
+++ b/lustre/tests/runas.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
@@ -166,4 +199,3 @@ int main(int argc, char **argv)
                  errno, strerror(errno));
          exit(-1);
  }
-
diff --git a/lustre/tests/rundbench b/lustre/tests/rundbench

index 40a8fde..c3fa9cb 100755 (executable)
--- a/lustre/tests/rundbench
+++ b/lustre/tests/rundbench
@@ -1,20 +1,90 @@
  #!/bin/sh
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+
+usage() {
+    echo "-C use chroot instead of cd"
+    echo "-D DIR - use 'DIR' as work directory"
+    echo
+    exit;
+}
+
+PATH=${DBENCH_LIB}:${PATH}
  MOUNT=${MOUNT:-/mnt/lustre}
  DIR=${DIR:-$MOUNT/`hostname`}
-#[ -e /proc/sys/lnet/debug ] && echo 0 > /proc/sys/lnet/debug 
+
+PREFIX="on"
+while getopts "CD:" opt $*; do
+       case $opt in
+               D) [ -d $OPTARG ] && DIR=$OPTARG ;;
+               C) CHROOT="yes" ;;
+               \?) usage ;;
+       esac
+done
+
  mkdir -p $DIR
  TGT=$DIR/client.txt
-DBENCH_LIB=${DBENCH_LIB:-/usr/share/dbench}
-SRC=${SRC:-$DBENCH_LIB/client.txt}
-[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-SRC=${SRC:-/usr/lib/dbench/client.txt}
-[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-SRC=/usr/lib/dbench/client_plain.txt
-[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-[ ! -s $TGT ] && echo "$0: $TGT doesn't exist (SRC=$SRC)" && exit 1
+CLIENT_PREFIX="${DBENCH_LIB} /usr/share/dbench /usr/local/share /usr/lib/dbench"
+CLIENT_FILE="client.txt client_plain.txt dbench_client"
+if ! which dbench > /dev/null 2>&1 ; then
+    [ "$MISSING_DBENCH_OK" ] || { error "dbench is not installed !" && exit 3; }
+    skip "$0: dbench is not installed"
+    exit 0
+fi
+CLIENT=""
+
+for prefix in $CLIENT_PREFIX; do
+       for file in $CLIENT_FILE; do
+               if [ -s "${prefix}/${file}" ]; then
+                       CLIENT="${prefix}/${file}";
+                       break;
+               fi
+       done
+       [ "x$CLIENT" != "x" ] && break;
+done
+
+if [ -n "$SRC" -a -s "$SRC" ]; then
+       CLIENT=${SRC}
+fi
+
+[ ! -s "$CLIENT" ] && \
+    skip "$0: no client file found for dbench DBENCH_LIB=$DBENCH_LIB SRC=$SRC" && \
+        exit 0 
+
+[ ! -s "$TGT" ] && echo "copying $CLIENT to $TGT" && cp $CLIENT $TGT
+[ ! -s "$TGT" ] && \
+    echo "$0: $TGT file doesn't exist after cp $CLIENT $TGT" && exit 1
+
+if [ "x$CHROOT" == "xyes" ]; then
+       echo "copying necessary libs to $DIR"
+       cp `which dbench` $DIR
+       LIBS71=$(ldd $DIR/dbench|sed -e 's/\t*//' -e 's/.*=> //' -e 's/ .*//' -e 's/^\///')
+       (cd / && tar chf - $LIBS71) | (cd $DIR && tar xvf -)
+       [ $? != 0 ] && echo "can't copy libs $LIBS71 to $DIR" && exit 1
+       RUN="chroot $DIR"
+       PREFIX="in"
+       PATH=.:/:$PATH
+fi
+
+shift $((OPTIND - 1))
+
+trap '
+echo kill dbench main pid=$DBENCHPID
+kill $DBENCHPID
+rm -rf dbench $LIBS71 client.txt
+exit 0
+' TERM
+
  cd $DIR
-echo "running 'dbench $@' on $PWD at `date`"
-dbench -c client.txt $@
+echo "running 'dbench $@' $PREFIX $PWD at `date`"
+
+$RUN dbench -c client.txt $@ &
+DBENCHPID=$!
+echo "dbench PID=$DBENCHPID"
+wait $DBENCHPID
  RC=$?
  [ $RC -ne 0 ] && killall -9 dbench
+
+rm -rf dbench $LIBS71 client.txt
  exit $RC
diff --git a/lustre/tests/runmultiop_bg_pause b/lustre/tests/runmultiop_bg_pause

index 823ebdd..3450e64 100644 (file)
--- a/lustre/tests/runmultiop_bg_pause
+++ b/lustre/tests/runmultiop_bg_pause
@@ -7,5 +7,12 @@ PTLDEBUG=${PTLDEBUG:--1}
  LUSTRE=${LUSTRE:-`dirname $0`/..}
  . $LUSTRE/tests/test-framework.sh
  
-multiop_bg_pause $*
-exit $?
+TMP=${TMP:-/tmp}
+MULTIOP_PID_FILE=${MULTIOP_PID_FILE:-$TMP/multiop_bg.pid}
+rm -f $MULTIOP_PID_FILE
+
+pid=$(multiop_bg_pause $* | tail -1)
+rc=${PIPESTATUS[0]}
+
+[ "$rc" = 0 ] && echo $pid > $MULTIOP_PID_FILE
+exit $rc
diff --git a/lustre/tests/runracer b/lustre/tests/runracer

new file mode 100644 (file)

index 0000000..fcc26ed
--- /dev/null
+++ b/lustre/tests/runracer
@@ -0,0 +1,113 @@
+#!/bin/bash
+#set -vx
+set -e
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+racer=`which racer.sh`
+[ -z "$racer" ] && echo racer is not installed && exit 1
+
+CLIENTS=${CLIENTS:-$HOSTNAME}
+RDIR=$DIR/racer
+mkdir -p $RDIR
+DURATION=${DURATION:-120}
+
+assert_env CLIENTS
+
+timer_on () {
+       sleep $1 && kill -s ALRM $$ &
+       TIMERPID=$!
+       echo TIMERPID=$TIMERPID
+}
+
+do_racer_cleanup () {
+       trap 0
+
+       local WAIT=0
+       local INTERVAL=5
+        local pids
+       local rc=0
+
+       echo "DOING RACER CLEANUP ... "
+
+       # Check if all processes are killed
+
+       local clients=$CLIENTS
+
+       # 1.Let chance to racer to kill all it's processes
+       # FIXME: not sure how long does it take for racer to kill all processes
+       # 80 is sometimes are enough for 2 clients; sometimes it takes more than 150 sec
+       while [ $WAIT -lt 90 ]; do
+               running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true)
+               [ -z "$running" ] && rc=0 && break
+               echo "clients $clients are still running the racer processes. Waited $WAIT secs"
+               echo $running
+               rc=1
+               [ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL))
+               sleep $INTERVAL
+               WAIT=$((WAIT + INTERVAL))
+       done
+
+       # 2. Kill the remaining processes
+       if [ $rc -ne 0 ]; then
+               for C in ${clients//,/ } ; do
+                       pids=$(do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" | awk '{print $2}' || true)
+                       if [ ! -z "$pids" ]; then
+                               echo "client $C still running racer processes after $WAIT seconds. Killing $pids"
+                               do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)"
+                               do_node $C kill -TERM $pids || true
+                               # let processes to be killed
+                               sleep 2
+       # 3. Check if the processes were killed
+       # exit error if the processes still exist
+                               for pid in $pids; do
+                                       do_node $C "ps -P $pid" && RC=1 || true
+                               done
+                       else
+                               echo "All processes on client $C exited after $WAIT seconds. OK."
+                       fi
+               done
+       else
+               echo "No racer processes running after $WAIT seconds. OK."
+               wait_remote_prog $racer 10
+       fi
+}
+
+racer_cleanup () {
+       if [ "$timeout" == "timeout" ]; then
+               echo $timeout killing RACERPID=$RACERPID
+               kill $RACERPID || true
+               sleep 2 # give chance racer to kill it's processes
+               do_racer_cleanup
+       else
+               echo "Racer completed before DURATION=$DURATION expired. Cleaning up..."
+               kill $TIMERPID
+               do_racer_cleanup
+       fi
+}
+
+racer_timeout () {
+       timeout="timeout"
+       racer_cleanup
+       echo "$0: completed $RC"
+       exit $RC
+}
+
+# run racer
+log "Start racer on clients: $CLIENTS DURATION=$DURATION"
+RC=0
+
+trap racer_timeout ALRM
+
+timer_on $((DURATION + 5))
+
+do_nodes $CLIENTS "DURATION=$DURATION $racer $RDIR" &
+RACERPID=$!
+echo RACERPID=$RACERPID
+wait $RACERPID || RC=2
+racer_cleanup
+echo "$0: completed $RC"
+exit $RC
diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh

index b8164a5..7834df3 100644 (file)
--- a/lustre/tests/sanity-quota.sh
+++ b/lustre/tests/sanity-quota.sh
@@ -46,8 +46,12 @@ LUSTRE=${LUSTRE:-`dirname $0`/..}
  . $LUSTRE/tests/test-framework.sh
  init_test_env $@
  . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+DIRECTIO=${DIRECTIO:-$LUSTRE/tests/directio}
  
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 21"
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 18b 21"
  
  QUOTALOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
  
@@ -56,13 +60,13 @@ QUOTALOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
  DIR=${DIR:-$MOUNT}
  DIR2=${DIR2:-$MOUNT2}
  
-cleanup_and_setup_lustre
+check_and_setup_lustre
  
  LOVNAME=`lctl get_param -n llite.*.lov.common_name | tail -n 1`
  OSTCOUNT=`lctl get_param -n lov.$LOVNAME.numobd`
  
-SHOW_QUOTA_USER="$LFS quota -u $TSTUSR $DIR"
-SHOW_QUOTA_GROUP="$LFS quota -g $TSTUSR $DIR"
+SHOW_QUOTA_USER="$LFS quota -v -u $TSTUSR $DIR"
+SHOW_QUOTA_GROUP="$LFS quota -v -g $TSTUSR $DIR"
  SHOW_QUOTA_INFO="$LFS quota -t $DIR"
  
  # control the time of tests
@@ -76,38 +80,38 @@ eval ONLY_99=true
  
  # set_blk_tunables(btune_sz)
  set_blk_tunesz() {
-        local btune=$(($1 * BLK_SZ))
+       local btune=$(($1 * BLK_SZ))
         # set btune size on all obdfilters
-       do_facet ost1 "lctl set_param obdfilter.*.quota_btune_sz=$btune"
+       do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_btune_sz=$btune"
         # set btune size on mds
-       do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_btune_sz=$btune"
+       do_facet mds  "lctl set_param lquota.${FSNAME}-MDT*.quota_btune_sz=$btune"
  }
  
  # set_blk_unitsz(bunit_sz)
  set_blk_unitsz() {
         local bunit=$(($1 * BLK_SZ))
         # set bunit size on all obdfilters
-       do_facet ost1 "lctl set_param obdfilter.*.quota_bunit_sz=$bunit"
+       do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_bunit_sz=$bunit"
         # set bunit size on mds
-       do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_bunit_sz=$bunit"
+       do_facet mds  "lctl set_param lquota.${FSNAME}-MDT*.quota_bunit_sz=$bunit"
  }
  
  # set_file_tunesz(itune_sz)
  set_file_tunesz() {
         local itune=$1
         # set itune size on all obdfilters
-       do_facet ost1 "lctl set_param obdfilter.*.quota_itune_sz=$itune"
+       do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_itune_sz=$itune"
         # set itune size on mds
-       do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_itune_sz=$itune"
+       do_facet mds  "lctl set_param lquota.${FSNAME}-MDT*.quota_itune_sz=$itune"
  }
  
  # set_file_unitsz(iunit_sz)
  set_file_unitsz() {
         local iunit=$1
         # set iunit size on all obdfilters
-       do_facet ost1 "lctl set_param obdfilter.*.quota_iunit_sz=$iunit"
+       do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_iunit_sz=$iunit"
         # set iunit size on mds
-       do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_iunit_sz=$iunit"
+       do_facet mds  "lctl set_param lquota.${FSNAME}-MDT*.quota_iunit_sz=$iunit"
  }
  
  lustre_fail() {
@@ -131,13 +135,31 @@ lustre_fail() {
         esac
  }
  
-RUNAS="runas -u $TSTID"
-RUNAS2="runas -u $TSTID2"
-FAIL_ON_ERROR=true check_runas_id $TSTID $RUNAS
-FAIL_ON_ERROR=true check_runas_id $TSTID2 $RUNAS2
+RUNAS="runas -u $TSTID -g $TSTID"
+RUNAS2="runas -u $TSTID2 -g $TSTID2"
+FAIL_ON_ERROR=true check_runas_id $TSTID $TSTID $RUNAS
+FAIL_ON_ERROR=true check_runas_id $TSTID2 $TSTID2 $RUNAS2
  
  FAIL_ON_ERROR=false
  
+run_test_with_stat() {
+       (($# != 2)) && error "the number of arguments is wrong"
+
+       do_facet mds  "lctl set_param lquota.${FSNAME}-MDT*.stats=0" > /dev/null
+       for j in `seq $OSTCOUNT`; do
+           do_facet ost$j "lctl set_param lquota.${FSNAME}-OST*.stats=0" > /dev/null
+       done
+       run_test "$@"
+       if [ ${STAT:-"yes"} != "no" -a -z "$LAST_SKIPPED" ]; then
+           echo "statistics info begin ***************************************"
+           do_facet mds  "lctl get_param lquota.${FSNAME}-MDT*.stats"
+           for j in `seq $OSTCOUNT`; do
+               do_facet ost$j "lctl get_param lquota.${FSNAME}-OST*.stats"
+           done
+           echo "statistics info end   ***************************************"
+       fi
+}
+
  # set quota
  test_0() {
         $LFS quotaoff -ug $DIR
@@ -152,7 +174,7 @@ test_0() {
             do_facet ost$num "lctl set_param debug=+quota"
         done
  }
-run_test 0 "Set quota ============================="
+run_test_with_stat 0 "Set quota ============================="
  
  # test for specific quota limitation, qunit, qtune $1=block_quota_limit
  test_1_sub() {
@@ -241,7 +263,7 @@ test_1() {
             set_blk_tunesz $((128 * 1024 / 2))
          done
  }
-run_test 1 "Block hard limit (normal use and out of quota) ==="
+run_test_with_stat 1 "Block hard limit (normal use and out of quota) ==="
  
  # test for specific quota limitation, qunit, qtune $1=block_quota_limit
  test_2_sub() {
@@ -331,7 +353,7 @@ test_2() {
             set_file_tunesz 2560
          done
  }
-run_test 2 "File hard limit (normal use and out of quota) ==="
+run_test_with_stat 2 "File hard limit (normal use and out of quota) ==="
  
  test_block_soft() {
         TESTFILE=$1
@@ -428,7 +450,7 @@ test_3() {
         test_block_soft $TESTFILE $GRACE
         $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
  }
-run_test 3 "Block soft limit (start timer, timer goes off, stop timer) ==="
+run_test_with_stat 3 "Block soft limit (start timer, timer goes off, stop timer) ==="
  
  test_file_soft() {
         TESTFILE=$1
@@ -514,7 +536,7 @@ test_4a() { # was test_4
         $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR
         $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR
  }
-run_test 4a "File soft limit (start timer, timer goes off, stop timer) ==="
+run_test_with_stat 4a "File soft limit (start timer, timer goes off, stop timer) ==="
  
  test_4b() {    # was test_4a
          GR_STR1="1w3d"
@@ -542,7 +564,7 @@ test_4b() { # was test_4a
          $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR
          $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR
  }
-run_test 4b "Grace time strings handling ==="
+run_test_with_stat 4b "Grace time strings handling ==="
  
  # chown & chgrp (chown & chgrp successfully even out of block/file quota)
  test_5() {
@@ -576,7 +598,7 @@ test_5() {
         $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
         $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
  }
-run_test 5 "Chown & chgrp successfully even out of block/file quota ==="
+run_test_with_stat 5 "Chown & chgrp successfully even out of block/file quota ==="
  
  # block quota acquire & release
  test_6() {
@@ -644,14 +666,13 @@ test_6() {
         $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
         return 0
  }
-run_test 6 "Block quota acquire & release ========="
+run_test_with_stat 6 "Block quota acquire & release ========="
  
  # quota recovery (block quota only by now)
  test_7()
  {
         mkdir -p $DIR/$tdir
         chmod 0777 $DIR/$tdir
-       remote_mds && skip "remote mds" && return 0
  
         wait_delete_completed
  
@@ -684,7 +705,7 @@ test_7()
  
         # check limits
         PATTERN="`echo $DIR | sed 's/\//\\\\\//g'`"
-       TOTAL_LIMIT="`$LFS quota -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
+       TOTAL_LIMIT="`$LFS quota -v -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
         [ $TOTAL_LIMIT -eq $LIMIT ] || error "total limits not recovery!"
         echo "  total limits = $TOTAL_LIMIT"
  
@@ -697,46 +718,31 @@ test_7()
         # cleanup
         $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
  }
-run_test 7 "Quota recovery (only block limit) ======"
+run_test_with_stat 7 "Quota recovery (only block limit) ======"
  
  # run dbench with quota enabled
  test_8() {
         mkdir -p $DIR/$tdir
         BLK_LIMIT=$((100 * 1024 * 1024)) # 100G
         FILE_LIMIT=1000000
-       DBENCH_LIB=${DBENCH_LIB:-/usr/lib/dbench}
-
-       [ ! -d $DBENCH_LIB ] && skip "dbench not installed" && return 0
  
         wait_delete_completed
  
         echo "  Set enough high limit for user: $TSTUSR"
         $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR
         echo "  Set enough high limit for group: $TSTUSR"
-       $LFS setquota -g $USER -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR
-
-       TGT=$DIR/$tdir/client.txt
-       SRC=${SRC:-$DBENCH_LIB/client.txt}
-       [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-       SRC=$DBENCH_LIB/client_plain.txt
-       [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+       $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR
  
         chmod 0777 $DIR/$tdir
-       SAVE_PWD=$PWD
-       cd $DIR/$tdir
         local duration=""
         [ "$SLOW" = "no" ] && duration=" -t 120"
-       $RUNAS dbench -c client.txt 3 $duration
-       RC=$?
-       [ $RC -ne 0 ] && killall -9 dbench
+       $RUNAS bash rundbench -D $DIR/$tdir 3 $duration || error "dbench failed!"
  
-       rm -f client.txt
         sync; sleep 3; sync;
  
-       cd $SAVE_PWD
-       return $RC
+       return 0 
  }
-run_test 8 "Run dbench with quota enabled ==========="
+run_test_with_stat 8 "Run dbench with quota enabled ==========="
  
  # run for fixing bug10707, it needs a big room. test for 64bit
  KB=1024
@@ -808,7 +814,7 @@ test_9() {
  
          return $RC
  }
-run_test 9 "run for fixing bug10707(64bit) ==========="
+run_test_with_stat 9 "run for fixing bug10707(64bit) ==========="
  
  # run for fixing bug10707, it need a big room. test for 32bit
  test_10() {
@@ -818,8 +824,8 @@ test_10() {
  
         wait_delete_completed
  
-       set_blk_tunesz 512
-       set_blk_unitsz 1024
+       set_blk_tunesz 512
+       set_blk_unitsz 1024
  
         # make qd_count 32 bit
         lustre_fail mds_ost 0xA00
@@ -866,7 +872,7 @@ test_10() {
  
         return $RC
  }
-run_test 10 "run for fixing bug10707(32bit) ==========="
+run_test_with_stat 10 "run for fixing bug10707(32bit) ==========="
  
  test_11() {
         wait_delete_completed
@@ -935,7 +941,7 @@ test_11() {
         fi
         return $RV
  }
-run_test 11 "run for fixing bug10912 ==========="
+run_test_with_stat 11 "run for fixing bug10912 ==========="
  
  
  # test a deadlock between quota and journal b=11693
@@ -977,7 +983,7 @@ test_12() {
         echo  "   step2: testing ......"
         count=0
         while [ true ]; do
-           if [ -z `ps -ef | awk '$2 == '${DDPID1}' { print $8 }'` ]; then break; fi
+           if ! ps -p ${DDPID1} > /dev/null 2>&1; then break; fi
             count=$[count+1]
             if [ $count -gt 64 ]; then
                 lustre_fail ost 0
@@ -993,7 +999,7 @@ test_12() {
         echo  "   step3: testing ......"
         count=0
         while [ true ]; do
-           if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi
+           if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi
             count=$[count+1]
             if [ $count -gt 150 ]; then
                 error "dd should be finished!"
@@ -1007,7 +1013,7 @@ test_12() {
  
          $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
  }
-run_test 12 "test a deadlock between quota and journal ==="
+run_test_with_stat 12 "test a deadlock between quota and journal ==="
  
  # test multiple clients write block quota b=11693
  test_13() {
@@ -1037,7 +1043,7 @@ test_13() {
         echo  "   step2: testing ......"
         count=0
         while [ true ]; do
-           if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi
+           if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi
             count=$[count+1]
             if [ $count -gt 64 ]; then
                 error "dd should be finished!"
@@ -1048,7 +1054,7 @@ test_13() {
  
         count=0
         while [ true ]; do
-           if [ -z `ps -ef | awk '$2 == '${DDPID1}' { print $8 }'` ]; then break; fi
+           if ! ps -p ${DDPID1} > /dev/null 2>&1 ; then break; fi
             count=$[count+1]
             if [ $count -gt 64 ]; then
                 error "dd should be finished!"
@@ -1071,10 +1077,10 @@ test_13() {
  
         $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
  }
-run_test 13 "test multiple clients write block quota ==="
+run_test_with_stat 13 "test multiple clients write block quota ==="
  
  check_if_quota_zero(){
-        line=`$LFS quota -$1 $2 $DIR | wc -l`
+        line=`$LFS quota -v -$1 $2 $DIR | wc -l`
         for i in `seq 3 $line`; do
             if [ $i -eq 3 ]; then
                 field="3 4 6 7"
@@ -1082,26 +1088,23 @@ check_if_quota_zero(){
                 field="3 5"
             fi
             for j in $field; do
-               tmp=`$LFS quota -$1 $2 $DIR | sed -n ${i}p |
+               tmp=`$LFS quota -v -$1 $2 $DIR | sed -n ${i}p |
                       awk  '{print $'"$j"'}'`
-               [ -n "$tmp" ] && [ $tmp -ne 0 ] && $LFS quota -$1 $2 $DIR && \
+               [ -n "$tmp" ] && [ $tmp -ne 0 ] && $LFS quota -v -$1 $2 $DIR && \
                     error "quota on $2 isn't clean"
             done
         done
         echo "pass check_if_quota_zero"
  }
  
-pre_test_14 () {
+test_14a() {   # was test_14 b=12223 -- setting quota on root
+       TESTFILE="$DIR/$tdir/$tfile"
+
          # reboot the lustre
          sync; sleep 5; sync
-        cd $T_PWD; sh llmountcleanup.sh || error "llmountcleanup failed"
-        sh llmount.sh
-        run_test 0 "reboot lustre"
-}
-pre_test_14
+        cleanup_and_setup_lustre
+        test_0
  
-test_14a() {   # was test_14 b=12223 -- setting quota on root
-       TESTFILE="$DIR/$tdir/$tfile"
         mkdir -p $DIR/$tdir
  
         # out of root's file and block quota
@@ -1129,13 +1132,13 @@ test_14a() {    # was test_14 b=12223 -- setting quota on root
         rm -f $TESTFILE
         sync; sleep 3; sync;
  }
-run_test 14a "test setting quota on root ==="
+run_test_with_stat 14a "test setting quota on root ==="
  
  # set quota version (both administrative and operational quotas)
  quota_set_version() {
-        do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_type=$1"
+        do_facet mds "lctl set_param lquota.${FSNAME}-MDT*.quota_type=$1"
          for j in `seq $OSTCOUNT`; do
-                do_facet ost$j "lctl set_param obdfilter.*.quota_type=$1"
+                do_facet ost$j "lctl set_param lquota.${FSNAME}-OST*.quota_type=$1"
          done
  }
  
@@ -1158,7 +1161,7 @@ test_14b(){
  
          MISSING_USERS=""
          for i in `seq 1 30`; do
-                check_runas_id_ret quota15_$i "runas -u quota15_$i" >/dev/null 2>/dev/null
+                check_runas_id_ret quota15_$i quota_usr "runas -u quota15_$i -g quota_usr" >/dev/null 2>/dev/null
                  if [ "$?" != "0" ]; then
                         MISSING_USERS="$MISSING_USERS quota15_$i"
                  fi
@@ -1174,10 +1177,11 @@ test_14b(){
          quota_set_version 1
          echo "running quotacheck"
          $LFS quotacheck -ug $DIR
+        mkdir -p $DIR/$tdir
          chmod 0777 $DIR/$tdir
          for i in `seq 1 30`; do
                  l=$[$i*1024*128] # set limits in 128 Mb units
-                $LFS setquota -u quota15_$i $l $l $l $l $DIR || error "lfs setquota failed"
+                $LFS setquota -u quota15_$i -b $l -B $l -i $l -I $l $DIR || error "lfs setquota failed"
                  runas -u quota15_$i dd if=/dev/zero of="$DIR/$tdir/quota15_$i" \
                        bs=1048576 count=$[($i+1)/2] || error "dd failed"
          done
@@ -1186,7 +1190,7 @@ test_14b(){
          
          echo "saving quota data"
          for i in `seq 1 30`; do
-                CURSPACE[$i]=`$LFS quota -u quota15_$i $MOUNT | awk '{if(start) {start=0; sum += $1} if(($1 ~ /OST/) && (NF==1)) {start=1;} 
+                CURSPACE[$i]=`$LFS quota -v -u quota15_$i $MOUNT | awk '{if(start) {start=0; sum += $1} if(($1 ~ /OST/) && (NF==1)) {start=1;} 
                                if(($1 ~ /OST/) && (NF != 1)) {sum += $2}; } END { print sum }'`
          done
  
@@ -1203,18 +1207,18 @@ test_14b(){
                  l=$[$i*1024*128]
                  # the format is "mntpnt   curspace[*]   bsoftlimit   bhardlimit   [time]   curinodes[*]    isoftlimit  ihardlimit"
                  echo "checking administrative quota migration results for user quota15_$i"
-                $LFS quota -u quota15_$i $DIR | grep -E '^ *'$MOUNT' *[0-9]+\** *'$l' *'$l' *[0-9]+\** *'$l' *'$l \
+                $LFS quota -v -u quota15_$i $DIR | grep -E '^ *'$MOUNT' *[0-9]+\** *'$l' *'$l' *[0-9]+\** *'$l' *'$l \
                    || error "lfs quota output is unexpected"
                  echo "checking operational quota migration results for user quota15_$i, curspace should be ${CURSPACE[$i]}"
-                l=`$LFS quota -u quota15_$i $MOUNT | awk '{if(start) {start=0; sum += $1} if(($1 ~ /OST/) && (NF==1)) {start=1;} 
+                l=`$LFS quota -v -u quota15_$i $MOUNT | awk '{if(start) {start=0; sum += $1} if(($1 ~ /OST/) && (NF==1)) {start=1;} 
                     if(($1 ~ /OST/) && (NF != 1)) {sum += $2}; } END { print sum }'`
                  echo "...real is $l"
                  [ "$l" -eq "${CURSPACE[$i]}" ] || error "curspace mismatch"
                  rm $DIR/$tdir/quota15_$i || error "could not remove quota15_$i"
-                $LFS setquota -u quota15_$i 0 0 0 0 $DIR || error "ifs setquota clear failed"
+                $LFS setquota -u quota15_$i -b 0 -B 0 -i 0 -I 0 $DIR || error "lfs setquota clear failed"
          done
  }
-run_test 14b "setting 30 quota entries in quota v1 file before conversion ==="
+run_test_with_stat 14b "setting 30 quota entries in quota v1 file before conversion ==="
  
  test_15(){
          LIMIT=$((24 * 1024 * 1024 * 1024 * 1024)) # 24 TB
@@ -1224,14 +1228,14 @@ test_15(){
  
          # test for user
          $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
-        TOTAL_LIMIT="`$LFS quota -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
+        TOTAL_LIMIT="`$LFS quota -v -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
          [ $TOTAL_LIMIT -eq $LIMIT ] || error "  (user)total limits = $TOTAL_LIMIT; limit = $LIMIT, failed!"
          echo "  (user)total limits = $TOTAL_LIMIT; limit = $LIMIT, successful!"
          $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
  
          # test for group
          $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
-        TOTAL_LIMIT="`$LFS quota -g $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
+        TOTAL_LIMIT="`$LFS quota -v -g $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
          [ $TOTAL_LIMIT -eq $LIMIT ] || error "  (group)total limits = $TOTAL_LIMIT; limit = $LIMIT, failed!"
          echo "  (group)total limits = $TOTAL_LIMIT; limit = $LIMIT, successful!"
          $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
@@ -1242,7 +1246,7 @@ test_15(){
          echo "Testing that >4GB quota limits fail on volume with quota v1"
          ! $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
  }
-run_test 15 "set block quota more than 4T ==="
+run_test_with_stat 15 "set block quota more than 4T ==="
  
  # $1=u/g $2=with qunit adjust or not
  test_16_tub() {
@@ -1303,7 +1307,7 @@ test_16 () {
         set_blk_unitsz $((128 * 1024))
         set_blk_tunesz $((128 * 1024 / 2))
  }
-run_test 16 "test without adjusting qunit"
+run_test_with_stat 16 "test without adjusting qunit"
  
  # run for fixing bug14526, failed returned quota reqs shouldn't ruin lustre.
  test_17() {
@@ -1358,9 +1362,12 @@ test_17() {
         set_blk_unitsz $((128 * 1024))
         set_blk_tunesz $((128 * 1024 / 2))
  
+       $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+       $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+
         return $RC
  }
-run_test 17 "run for fixing bug14526 ==========="
+run_test_with_stat 17 "run for fixing bug14526 ==========="
  
  # test when mds takes a long time to handle a quota req so that
  # the ost has dropped it, the ost still could work well b=14840
@@ -1395,20 +1402,18 @@ test_18() {
         count=0
         timeout=$(lctl get_param -n timeout)
         while [ true ]; do
-           if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi
+           if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi
             count=$[count+1]
-           if [ $count -gt $((2 * $timeout)) ]; then
+           if [ $count -gt $((4 * $timeout)) ]; then
                 error "count=$count dd should be finished!"
             fi
             sleep 1
         done
          log "(dd_pid=$DDPID, time=$count, timeout=$timeout)"
-        if [ $count -lt $(($timeout - 10)) ]; then
-            error " should take longer!"
-        else
-            echo " successful"
-        fi
  
+        testfile_size=$(stat -c %s $TESTFILE)
+        [ $testfile_size -ne $((BLK_SZ * 1024 * 100)) ] && \
+           error "expect $((BLK_SZ * 1024 * 100)), got ${testfile_size}. Verifying file failed!"
         rm -f $TESTFILE
         sync; sleep 3; sync;
  
@@ -1417,7 +1422,7 @@ test_18() {
         set_blk_unitsz $((128 * 1024))
         set_blk_tunesz $((128 * 1024 / 2))
  }
-run_test 18 "run for fixing bug14840 ==========="
+run_test_with_stat 18 "run for fixing bug14840 ==========="
  
  # test when mds drops a quota req, the ost still could work well b=14840
  test_18a() {
@@ -1448,7 +1453,7 @@ test_18a() {
         count=0
         timeout=$(lctl get_param -n timeout)
         while [ true ]; do
-           if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi
+           if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi
             count=$[count+1]
             if [ $count -gt $((3 * $timeout)) ]; then
                 lustre_fail mds 0
@@ -1457,12 +1462,6 @@ test_18a() {
             sleep 1
         done
          log "(dd_pid=$DDPID, time=$count, timeout=$timeout)"
-        if [ $count -lt $(($timeout - 10)) ]; then
-           lustre_fail mds 0
-            error " should take longer!"
-        else
-            echo " successful"
-        fi
  
          lustre_fail mds 0
  
@@ -1474,14 +1473,105 @@ test_18a() {
         set_blk_unitsz $((128 * 1024))
         set_blk_tunesz $((128 * 1024 / 2))
  }
-run_test 18a "run for fixing bug14840 ==========="
+run_test_with_stat 18a "run for fixing bug14840 ==========="
  
-test_19() {
-       # 1 Mb bunit per each MDS/OSS
-       LIMIT=$((($OSTCOUNT + 1) * 1024))
-       TESTFILE="$DIR/$tdir/$tfile"
-       mkdir -p $DIR/$tdir
+# test when mds do failover, the ost still could work well without trigger
+# watchdog b=14840
+test_18bc_sub() {
+        type=$1
+
+        LIMIT=$((110 * 1024 )) # 110M
+        TESTFILE="$DIR/$tdir/$tfile"
+        mkdir -p $DIR/$tdir
+
+        wait_delete_completed
+
+        set_blk_tunesz 512
+        set_blk_unitsz 1024
+
+        log "   User quota (limit: $LIMIT kbytes)"
+        $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT
+        $SHOW_QUOTA_USER
+
+        $LFS setstripe $TESTFILE -i 0 -c 1
+        chown $TSTUSR.$TSTUSR $TESTFILE
+
+        timeout=$(sysctl -n lustre.timeout)
+
+       if [ $type = "directio" ]; then
+           log "   write 100M block(directio) ..."
+           $RUNAS $DIRECTIO write $TESTFILE 0 100 $((BLK_SZ * 1024)) &
+       else
+           log "   write 100M block(normal) ..."
+           $RUNAS dd if=/dev/zero of=$TESTFILE bs=$((BLK_SZ * 1024)) count=100 &
+       fi
+
+        DDPID=$!
+        do_facet mds "$LCTL conf_param ${FSNAME}-MDT*.mdt.quota_type=ug"
+
+       log "failing mds for $((2 * timeout)) seconds"
+        fail mds $((2 * timeout))
+
+        # check if quotaon successful
+        $LFS quota -u $TSTUSR $MOUNT 2>&1 | grep -q "quotas are not enabled"
+        if [ $? -eq 0 ]; then
+            error "quotaon failed!"
+            rm -rf $TESTFILE
+            return
+        fi
+
+        count=0
+        while [ true ]; do
+           if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi
+            if [ $((++count % (2 * timeout) )) -eq 0 ]; then
+                log "it took $count second"
+            fi
+            sleep 1
+        done
+        log "(dd_pid=$DDPID, time=$count, timeout=$timeout)"
+        sync; sleep 1; sync
+
+        testfile_size=$(stat -c %s $TESTFILE)
+        [ $testfile_size -ne $((BLK_SZ * 1024 * 100)) ] && \
+           error "expect $((BLK_SZ * 1024 * 100)), got ${testfile_size}. Verifying file failed!"
+        $SHOW_QUOTA_USER
+        $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+        rm -rf $TESTFILE
+        sync; sleep 1; sync
+}
  
+# test when mds does failover, the ost still could work well
+# this test shouldn't trigger watchdog b=14840
+test_18b() {
+       test_18bc_sub normal
+       test_18bc_sub directio
+       # check if watchdog is triggered
+       do_facet ost1 dmesg > $TMP/lustre-log-${TESTNAME}.log
+       watchdog=`awk '/test 18b/ {start = 1;}
+                      /Watchdog triggered/ {
+                              if (start) {
+                                      print;
+                              }
+                      }' $TMP/lustre-log-${TESTNAME}.log`
+       if [ -n "$watchdog" ]; then error "$watchdog"; fi
+       rm -f $TMP/lustre-log-${TESTNAME}.log
+}
+run_test_with_stat 18b "run for fixing bug14840(mds failover, no watchdog) ==========="
+
+# test when mds does failover, the ost still could work well
+# this test will prevent OST_DISCONNET from happening b=14840
+test_18c() {
+       # define OBD_FAIL_OST_DISCONNECT_NET 0x202(disable ost_disconnect for osts)
+       lustre_fail ost  0x202
+       test_18bc_sub normal
+       test_18bc_sub directio
+       lustre_fail ost  0
+}
+run_test_with_stat 18c "run for fixing bug14840(mds failover, OST_DISCONNECT is disabled) ==========="
+
+run_to_block_limit() {
+       local LIMIT=$((($OSTCOUNT + 1) * $BUNIT_SZ))
+       local TESTFILE=$1
         wait_delete_completed
  
         # set 1 Mb quota unit size
@@ -1499,13 +1589,21 @@ test_19() {
         $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT
         $SHOW_QUOTA_USER
  
-       $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=1028 || true
+       RUNDD="$RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ"
+       $RUNDD count=$BUNIT_SZ || error "(usr) write failure, but expect success"
         # for now page cache of TESTFILE may still be dirty,
         # let's push it to the corresponding OST, this will also
         # cache NOQUOTA on the client from OST's reply
         cancel_lru_locks osc
-       $RUNAS dd if=/dev/zero of=$TESTFILE seek=1028 bs=$BLK_SZ count=1 && \
-               error "(usr) write success, should be EDQUOT"
+       $RUNDD seek=$BUNIT_SZ && error "(usr) write success, should be EDQUOT"
+}
+
+test_19() {
+       # 1 Mb bunit per each MDS/OSS
+       local TESTFILE="$DIR/$tdir/$tfile"
+       mkdir -p $DIR/$tdir
+
+       run_to_block_limit $TESTFILE
         $SHOW_QUOTA_USER
  
         # cleanup
@@ -1516,7 +1614,7 @@ test_19() {
         set_blk_tunesz $((128 * 1024 / 2))
  
  }
-run_test 19 "test if administrative limits updates do not zero operational limits (14790) ==="
+run_test_with_stat 19 "test if administrative limits updates do not zero operational limits (14790) ==="
  
  test_20()
  {
@@ -1531,7 +1629,7 @@ test_20()
                                   --inode-hardlimit ${LSTR[3]} \
                                   $MOUNT || error "could not set quota limits"
  
-        ($LFS quota -u $TSTUSR $MOUNT  | \
+        ($LFS quota -v -u $TSTUSR $MOUNT  | \
              grep -E '^ *'$MOUNT' *[0-9]+\** *'${LVAL[0]}' *'${LVAL[1]}' *[0-9]+\** *'${LVAL[2]}' *'${LVAL[3]}) \
                   || error "lfs quota output is unexpected"
  
@@ -1539,7 +1637,7 @@ test_20()
                                   $MOUNT || error "could not reset quota limits"
  
  }
-run_test 20 "test if setquota specifiers work properly (15754)"
+run_test_with_stat 20 "test if setquota specifiers work properly (15754)"
  
  test_21_sub() {
         local testfile=$1
@@ -1590,7 +1688,7 @@ test_21() {
  
         count=0
         while [ true ]; do
-           if [  $(ps -p ${DDPID1} | wc -l) -eq 1 ]; then break; fi
+           if ! ps -p ${DDPID1} > /dev/null 2>&1; then break; fi
             count=$[count+1]
             if [ $count -gt 60 ]; then
                 error "dd should be finished!"
@@ -1601,7 +1699,7 @@ test_21() {
  
         count=0
         while [ true ]; do
-           if [ $(ps -p ${DDPID2} | wc -l) -eq 1 ]; then break; fi
+           if ! ps -p ${DDPID2} > /dev/null 2>&1; then break; fi
             count=$[count+1]
             if [ $count -gt 60 ]; then
                 error "dd should be finished!"
@@ -1617,7 +1715,7 @@ test_21() {
  
         return $RC
  }
-run_test 21 "run for fixing bug16053 ==========="
+run_test_with_stat 21 "run for fixing bug16053 ==========="
  
  test_22() {
          local SAVEREFORMAT
@@ -1642,7 +1740,76 @@ test_22() {
  
          run_test 0 "reboot lustre"
  }
-run_test 22 "test if quota_type saved as permanent parameter ===="
+run_test_with_stat 22 "test if quota_type saved as permanent parameter ===="
+
+test_23_sub() {
+       mkdir -p $DIR/$tdir
+       chmod 0777 $DIR/$tdir
+       TESTFILE="$DIR/$tdir/$tfile-0"
+       local bs_unit=$((1024*1024))
+       LIMIT=$1
+
+       wait_delete_completed
+
+       # test for user
+       log "  User quota (limit: $LIMIT kbytes)"
+       $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
+       sleep 3
+       $SHOW_QUOTA_USER
+
+       $LFS setstripe $TESTFILE -c 1
+       chown $TSTUSR.$TSTUSR $TESTFILE
+
+       log "    Step1: trigger quota with 0_DIRECT"
+       log "      Write half of file"
+       $RUNAS $DIRECTIO write $TESTFILE 0 $(($LIMIT/1024/2)) $bs_unit || error "(usr) write failure, but expect success"
+       log "      Write out of block quota ..."
+       $RUNAS $DIRECTIO write $TESTFILE $(($LIMIT/1024/2)) $(($LIMIT/1024/2)) $bs_unit && error "(usr) write success, but expect EDQUOT"
+       log "    Step1: done"
+
+       log "    Step2: rewrite should succeed"
+       $RUNAS $DIRECTIO write $TESTFILE $(($LIMIT/1024/2)) 1 $bs_unit 2>&1 || error "(usr) write failure, but expect success"
+       log "    Step2: done"
+
+       rm -f $TESTFILE
+       wait_delete_completed
+       OST0_UUID=`do_facet ost1 $LCTL dl | grep -m1 obdfilter | awk '{print $((NF-1))}'`
+       OST0_QUOTA_USED=`$LFS quota -o $OST0_UUID -u $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $1 }'`
+       echo $OST0_QUOTA_USED
+       [ $OST0_QUOTA_USED -ne 0 ] && \
+           ($SHOW_QUOTA_USER; error "quota deleted isn't released")
+       $SHOW_QUOTA_USER
+       $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
+
+}
+
+test_23() {
+       log "run for $((OSTCOUNT * 3))MB test file"
+       test_23_sub $((OSTCOUNT * 3 * 1024))
+
+       OST0_MIN=120000
+       check_whether_skip && return 0
+       log "run for $((OSTCOUNT * 30))MB test file"
+       test_23_sub $((OSTCOUNT * 30 * 1024))
+}
+run_test_with_stat 23 "run for fixing bug16125 ==========="
+
+test_24() {
+       local TESTFILE="$DIR/$tdir/$tfile"
+       mkdir -p $DIR/$tdir
+
+       run_to_block_limit $TESTFILE
+       $SHOW_QUOTA_USER | grep '*' || error "no matching *"
+
+       # cleanup
+       rm -f $TESTFILE
+       $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+
+       set_blk_unitsz $((128 * 1024))
+       set_blk_tunesz $((128 * 1024 / 2))
+        
+}
+run_test_with_stat 24 "test if lfs draws an asterix when limit is reached (16646) ==========="
  
  # turn off quota
  test_99()
@@ -1652,7 +1819,7 @@ test_99()
  
         return 0
  }
-run_test 99 "Quota off ==============================="
+run_test_with_stat 99 "Quota off ==============================="
  
  
  log "cleanup: ======================================================"
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh

index 727ff63..821f40e 100644 (file)
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -7,8 +7,8 @@
  set -e
  
  ONLY=${ONLY:-"$*"}
-# bug number for skipped test:  13297 2108 9789 3637 9789 3561 12622 15528/2330 5188 10764
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27u   42a  42b  42c  42d  45   51d   62         68   75 $SANITY_EXCEPT" }
+# bug number for skipped test:  13297 2108 9789 3637 9789 3561 12622 15528/2330 5188 10764 16410
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27u   42a  42b  42c  42d  45   51d   62         68   75    76 $SANITY_EXCEPT"}
  # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
  
  # Tests that fail on uml, maybe elsewhere, FIXME
@@ -33,7 +33,6 @@ CREATETEST=${CREATETEST:-createtest}
  LFS=${LFS:-lfs}
  SETSTRIPE=${SETSTRIPE:-"$LFS setstripe"}
  GETSTRIPE=${GETSTRIPE:-"$LFS getstripe"}
-LSTRIPE=${LSTRIPE:-"$LFS setstripe"}
  LFIND=${LFIND:-"$LFS find"}
  LVERIFY=${LVERIFY:-ll_dirstripe_verify}
  LSTRIPEINFO=${LSTRIPEINFO:-ll_getstripe_info}
@@ -71,7 +70,7 @@ init_test_env $@
  [ "$SLOW" = "no" ] && EXCEPT_SLOW="24o 27m 36f 36g 51b 51c 60c 63 64b 68 71 73 77f 78 101 103 115 120g 124b"
  
  SANITYLOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
-FAIL_ON_ERROR=false
+FAIL_ON_ERROR=${FAIL_ON_ERROR:-false}
  
  cleanup() {
         echo -n "cln.."
@@ -90,8 +89,10 @@ check_kernel_version() {
         VERSION_FILE=version
         WANT_VER=$1
         GOT_VER=$(lctl get_param -n $VERSION_FILE | awk '/kernel:/ {print $2}')
-       [ $GOT_VER == "patchless" ] && return 0
-       [ $GOT_VER -ge $WANT_VER ] && return 0
+       case $GOT_VER in
+       patchless|patchless_client) return 0;;
+       *) [ $GOT_VER -ge $WANT_VER ] && return 0 ;;
+       esac
         log "test needs at least kernel version $WANT_VER, running $GOT_VER"
         return 1
  }
@@ -122,7 +123,7 @@ rm -rf $DIR/[Rdfs][0-9]*
  # $RUNAS_ID may get set incorrectly somewhere else
  [ $UID -eq 0 -a $RUNAS_ID -eq 0 ] && error "\$RUNAS_ID set to 0, but \$UID is also 0!"
  
-check_runas_id $RUNAS_ID $RUNAS
+check_runas_id $RUNAS_ID $RUNAS_ID $RUNAS
  
  build_test_filter
  
@@ -150,6 +151,12 @@ test_0b() {
  }
  run_test 0b "chmod 0755 $DIR ============================="
  
+test_0c() {
+    $LCTL get_param mdc.*.import | grep  "state: FULL" || error "import not FULL"
+    $LCTL get_param mdc.*.import | grep  "target: $FSNAME-MDT" || error "bad target"
+}
+run_test 0c "check import proc ============================="
+
  test_1a() {
         mkdir $DIR/d1
         mkdir $DIR/d1/d2
@@ -436,6 +443,14 @@ test_17e() {
  }
  run_test 17e "symlinks: create recursive symlink (should return error) ===="
  
+test_17g() {
+        mkdir -p $DIR/$tdir
+        LONGSYMLINK="$(dd if=/dev/zero bs=4095 count=1 | tr '\0' 'x')"
+        ln -s $LONGSYMLINK $DIR/$tdir/$tfile
+        ls -l $DIR/$tdir
+}
+run_test 17g "symlinks: really long symlink name ==============================="
+
  test_18() {
         touch $DIR/f
         ls $DIR || error
@@ -549,7 +564,7 @@ test_24d() {
  run_test 24d "mkdir .../R4/{f,g}; rename .../R4/f .../R4/g ====="
  
  test_24e() {
-       echo '-- cross directory renames --' 
+       echo '-- cross directory renames --'
         mkdir $DIR/R5{a,b}
         touch $DIR/R5a/f
         mv $DIR/R5a/f $DIR/R5b/g
@@ -604,7 +619,7 @@ test_24j() {
         $CHECKSTAT -a $DIR/R10/f || error
         $CHECKSTAT -a $DIR/R10/g || error
  }
-run_test 24j "source does not exist ============================" 
+run_test 24j "source does not exist ============================"
  
  test_24k() {
         mkdir $DIR/R11a $DIR/R11a/d
@@ -801,7 +816,7 @@ run_test 27c "create two stripe file f01 ======================="
  
  test_27d() {
         mkdir -p $DIR/d27
-       $SETSTRIPE $DIR/d27/fdef 0 -1 0 || error "lstripe failed"
+       $SETSTRIPE -c0 -i-1 -s0 $DIR/d27/fdef || error "lstripe failed"
         $CHECKSTAT -t file $DIR/d27/fdef || error "checkstat failed"
         dd if=/dev/zero of=$DIR/d27/fdef bs=4k count=4 || error
  }
@@ -896,24 +911,25 @@ reset_enospc() {
         [ "$1" ] && FAIL_LOC=$1 || FAIL_LOC=0
         mkdir -p $DIR/d27/nospc
         rmdir $DIR/d27/nospc
-       lctl set_param fail_loc=$FAIL_LOC
+       do_nodes $(comma_list $(osts_nodes)) lctl set_param fail_loc=$FAIL_LOC
  }
  
  exhaust_precreations() {
         OSTIDX=$1
-       OST=$(lctl get_param -n lov.${LOVNAME}.target_obd | grep ${OSTIDX}": " | \
+
+       OST=$(lfs osts | grep ${OSTIDX}": " | \
             awk '{print $2}' | sed -e 's/_UUID$//')
         # on the mdt's osc
-       last_id=$(lctl get_param -n osc.${OST}-osc.prealloc_last_id)
-       next_id=$(lctl get_param -n osc.${OST}-osc.prealloc_next_id)
+       last_id=$(do_facet mds lctl get_param -n osc.${OST}-osc.prealloc_last_id)
+       next_id=$(do_facet mds lctl get_param -n osc.${OST}-osc.prealloc_next_id)
  
         mkdir -p $DIR/d27/${OST}
         $SETSTRIPE $DIR/d27/${OST} -i $OSTIDX -c 1
         #define OBD_FAIL_OST_ENOSPC 0x215
-       lctl set_param fail_loc=0x215
+       do_facet ost$((OSTIDX + 1)) lctl set_param fail_loc=0x215
         echo "Creating to objid $last_id on ost $OST..."
         createmany -o $DIR/d27/${OST}/f $next_id $((last_id - next_id + 2))
-       lctl get_param -n osc.${OST}-osc.prealloc* | grep '[0-9]'
+       do_facet mds "lctl get_param -n osc.${OST}-osc.prealloc*" | grep '[0-9]'
         reset_enospc $2
  }
  
@@ -927,7 +943,8 @@ exhaust_all_precreations() {
  
  test_27n() {
         [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_enospc
         rm -f $DIR/d27/f27n
@@ -941,7 +958,8 @@ run_test 27n "create file with some full OSTs =================="
  
  test_27o() {
         [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_enospc
         rm -f $DIR/d27/f27o
@@ -956,7 +974,8 @@ run_test 27o "create file with all full OSTs (should error) ===="
  
  test_27p() {
         [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_enospc
         rm -f $DIR/d27/f27p
@@ -975,7 +994,8 @@ run_test 27p "append to a truncated file with some full OSTs ==="
  
  test_27q() {
         [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_enospc
         rm -f $DIR/d27/f27q
@@ -995,7 +1015,8 @@ run_test 27q "append to truncated file with all OSTs full (should error) ==="
  
  test_27r() {
         [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_enospc
         rm -f $DIR/d27/f27r
@@ -1009,7 +1030,7 @@ run_test 27r "stripe file with some full OSTs (shouldn't LBUG) ="
  
  test_27s() { # bug 10725
         mkdir -p $DIR/$tdir
-       $LSTRIPE $DIR/$tdir $((2048 * 1024 * 1024)) -1 2 && \
+       $SETSTRIPE $DIR/$tdir $((4096 * 1024 * 1024)) -1 2 && \
                 error "stripe width >= 2^32 succeeded" || true
  }
  run_test 27s "lsm_xfersize overflow (should error) (bug 10725)"
@@ -1025,15 +1046,15 @@ test_27t() { # bug 10864
  run_test 27t "check that utils parse path correctly"
  
  test_27u() { # bug 4900
-       [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
+        [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
+        remote_mds_nodsh && skip "remote MDS with nodsh" && return
  
          #define OBD_FAIL_MDS_OSC_PRECREATE      0x139
  
-        lctl set_param fail_loc=0x139
+        do_facet mds lctl set_param fail_loc=0x139
          mkdir -p $DIR/d27u
          createmany -o $DIR/d27u/t- 1000
-        lctl set_param fail_loc=0
+        do_facet mds lctl set_param fail_loc=0
  
          TLOG=$DIR/$tfile.getstripe
          $GETSTRIPE $DIR/d27u > $TLOG
@@ -1046,7 +1067,8 @@ run_test 27u "skip object creation on OSC w/o objects =========="
  
  test_27v() { # bug 4900
         [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
          exhaust_all_precreations
  
@@ -1073,14 +1095,15 @@ run_test 27v "skip object creation on slow OST ================="
  
  test_27w() { # bug 10997
          mkdir -p $DIR/d27w || error "mkdir failed"
-        $LSTRIPE $DIR/d27w/f0 -s 65536 || error "lstripe failed"
+        $SETSTRIPE $DIR/d27w/f0 -s 65536 || error "lstripe failed"
          size=`$LSTRIPEINFO $DIR/d27w/f0 | awk {'print $1'}`
          [ $size -ne 65536 ] && error "stripe size $size != 65536" || true
  
          [ "$OSTCOUNT" -lt "2" ] && skip "skipping multiple stripe count/offset test" && return
          for i in `seq 1 $OSTCOUNT`; do
                  offset=$(($i-1))
-                $LSTRIPE $DIR/d27w/f$i -c $i -i $offset || error "lstripe -c $i -i $offset failed"
+                log setstripe $DIR/d27w/f$i -c $i -i $offset
+                $SETSTRIPE $DIR/d27w/f$i -c $i -i $offset || error "lstripe -c $i -i $offset failed"
                  count=`$LSTRIPEINFO $DIR/d27w/f$i | awk {'print $2'}`
                  index=`$LSTRIPEINFO $DIR/d27w/f$i | awk {'print $3'}`
                  [ $count -ne $i ] && error "stripe count $count != $i" || true
@@ -1103,7 +1126,7 @@ test_29() {
         ls -l $DIR/d29
         LOCKCOUNTORIG=`lctl get_param -n ldlm.namespaces.*mdc*.lock_count`
         LOCKUNUSEDCOUNTORIG=`lctl get_param -n ldlm.namespaces.*mdc*.lock_unused_count`
-       [ -z $"LOCKCOUNTORIG" ] && echo "No mdc lock count" && return 1
+       [ -z $"LOCKCOUNTORIG" ] && error "No mdc lock count" && return 1
         log 'second d29'
         ls -l $DIR/d29
         log 'done'
@@ -1202,27 +1225,27 @@ run_test 31f "remove of open directory with open-unlink file ==="
  test_32a() {
         echo "== more mountpoints and symlinks ================="
         [ -e $DIR/d32a ] && rm -fr $DIR/d32a
-       mkdir -p $DIR/d32a/ext2-mountpoint 
+       mkdir -p $DIR/d32a/ext2-mountpoint
         mount -t ext2 -o loop $EXT2_DEV $DIR/d32a/ext2-mountpoint || error
-       $CHECKSTAT -t dir $DIR/d32a/ext2-mountpoint/.. || error  
+       $CHECKSTAT -t dir $DIR/d32a/ext2-mountpoint/.. || error
         $UMOUNT $DIR/d32a/ext2-mountpoint || error
  }
  run_test 32a "stat d32a/ext2-mountpoint/.. ====================="
  
  test_32b() {
         [ -e $DIR/d32b ] && rm -fr $DIR/d32b
-       mkdir -p $DIR/d32b/ext2-mountpoint 
+       mkdir -p $DIR/d32b/ext2-mountpoint
         mount -t ext2 -o loop $EXT2_DEV $DIR/d32b/ext2-mountpoint || error
         ls -al $DIR/d32b/ext2-mountpoint/.. || error
         $UMOUNT $DIR/d32b/ext2-mountpoint || error
  }
  run_test 32b "open d32b/ext2-mountpoint/.. ====================="
- 
+
  test_32c() {
         [ -e $DIR/d32c ] && rm -fr $DIR/d32c
-       mkdir -p $DIR/d32c/ext2-mountpoint 
+       mkdir -p $DIR/d32c/ext2-mountpoint
         mount -t ext2 -o loop $EXT2_DEV $DIR/d32c/ext2-mountpoint || error
-       mkdir -p $DIR/d32c/d2/test_dir    
+       mkdir -p $DIR/d32c/d2/test_dir
         $CHECKSTAT -t dir $DIR/d32c/ext2-mountpoint/../d2/test_dir || error
         $UMOUNT $DIR/d32c/ext2-mountpoint || error
  }
@@ -1230,9 +1253,9 @@ run_test 32c "stat d32c/ext2-mountpoint/../d2/test_dir ========="
  
  test_32d() {
         [ -e $DIR/d32d ] && rm -fr $DIR/d32d
-       mkdir -p $DIR/d32d/ext2-mountpoint 
+       mkdir -p $DIR/d32d/ext2-mountpoint
         mount -t ext2 -o loop $EXT2_DEV $DIR/d32d/ext2-mountpoint || error
-       mkdir -p $DIR/d32d/d2/test_dir    
+       mkdir -p $DIR/d32d/d2/test_dir
         ls -al $DIR/d32d/ext2-mountpoint/../d2/test_dir || error
         $UMOUNT $DIR/d32d/ext2-mountpoint || error
  }
@@ -1240,10 +1263,10 @@ run_test 32d "open d32d/ext2-mountpoint/../d2/test_dir ========="
  
  test_32e() {
         [ -e $DIR/d32e ] && rm -fr $DIR/d32e
-       mkdir -p $DIR/d32e/tmp    
-       TMP_DIR=$DIR/d32e/tmp       
-       ln -s $DIR/d32e $TMP_DIR/symlink11 
-       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+       mkdir -p $DIR/d32e/tmp
+       TMP_DIR=$DIR/d32e/tmp
+       ln -s $DIR/d32e $TMP_DIR/symlink11
+       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01
         $CHECKSTAT -t link $DIR/d32e/tmp/symlink11 || error
         $CHECKSTAT -t link $DIR/d32e/symlink01 || error
  }
@@ -1251,20 +1274,20 @@ run_test 32e "stat d32e/symlink->tmp/symlink->lustre-subdir ===="
  
  test_32f() {
         [ -e $DIR/d32f ] && rm -fr $DIR/d32f
-       mkdir -p $DIR/d32f/tmp    
-       TMP_DIR=$DIR/d32f/tmp       
-       ln -s $DIR/d32f $TMP_DIR/symlink11 
-       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+       mkdir -p $DIR/d32f/tmp
+       TMP_DIR=$DIR/d32f/tmp
+       ln -s $DIR/d32f $TMP_DIR/symlink11
+       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01
         ls $DIR/d32f/tmp/symlink11  || error
         ls $DIR/d32f/symlink01 || error
  }
  run_test 32f "open d32f/symlink->tmp/symlink->lustre-subdir ===="
  
  test_32g() {
-       TMP_DIR=$DIR/$tdir/tmp       
+       TMP_DIR=$DIR/$tdir/tmp
         mkdir -p $TMP_DIR $DIR/${tdir}2
-       ln -s $DIR/${tdir}2 $TMP_DIR/symlink12 
-       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+       ln -s $DIR/${tdir}2 $TMP_DIR/symlink12
+       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02
         $CHECKSTAT -t link $TMP_DIR/symlink12 || error
         $CHECKSTAT -t link $DIR/$tdir/symlink02 || error
         $CHECKSTAT -t dir -f $TMP_DIR/symlink12 || error
@@ -1274,10 +1297,10 @@ run_test 32g "stat d32g/symlink->tmp/symlink->lustre-subdir/${tdir}2"
  
  test_32h() {
         rm -fr $DIR/$tdir $DIR/${tdir}2
-       TMP_DIR=$DIR/$tdir/tmp       
-       mkdir -p $TMP_DIR $DIR/${tdir}2 
-       ln -s $DIR/${tdir}2 $TMP_DIR/symlink12 
-       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+       TMP_DIR=$DIR/$tdir/tmp
+       mkdir -p $TMP_DIR $DIR/${tdir}2
+       ln -s $DIR/${tdir}2 $TMP_DIR/symlink12
+       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02
         ls $TMP_DIR/symlink12 || error
         ls $DIR/$tdir/symlink02  || error
  }
@@ -1285,17 +1308,17 @@ run_test 32h "open d32h/symlink->tmp/symlink->lustre-subdir/${tdir}2"
  
  test_32i() {
         [ -e $DIR/d32i ] && rm -fr $DIR/d32i
-       mkdir -p $DIR/d32i/ext2-mountpoint 
+       mkdir -p $DIR/d32i/ext2-mountpoint
         mount -t ext2 -o loop $EXT2_DEV $DIR/d32i/ext2-mountpoint || error
         touch $DIR/d32i/test_file
-       $CHECKSTAT -t file $DIR/d32i/ext2-mountpoint/../test_file || error  
+       $CHECKSTAT -t file $DIR/d32i/ext2-mountpoint/../test_file || error
         $UMOUNT $DIR/d32i/ext2-mountpoint || error
  }
  run_test 32i "stat d32i/ext2-mountpoint/../test_file ==========="
  
  test_32j() {
         [ -e $DIR/d32j ] && rm -fr $DIR/d32j
-       mkdir -p $DIR/d32j/ext2-mountpoint 
+       mkdir -p $DIR/d32j/ext2-mountpoint
         mount -t ext2 -o loop $EXT2_DEV $DIR/d32j/ext2-mountpoint || error
         touch $DIR/d32j/test_file
         cat $DIR/d32j/ext2-mountpoint/../test_file || error
@@ -1305,8 +1328,8 @@ run_test 32j "open d32j/ext2-mountpoint/../test_file ==========="
  
  test_32k() {
         rm -fr $DIR/d32k
-       mkdir -p $DIR/d32k/ext2-mountpoint 
-       mount -t ext2 -o loop $EXT2_DEV $DIR/d32k/ext2-mountpoint  
+       mkdir -p $DIR/d32k/ext2-mountpoint
+       mount -t ext2 -o loop $EXT2_DEV $DIR/d32k/ext2-mountpoint
         mkdir -p $DIR/d32k/d2
         touch $DIR/d32k/d2/test_file || error
         $CHECKSTAT -t file $DIR/d32k/ext2-mountpoint/../d2/test_file || error
@@ -1316,7 +1339,7 @@ run_test 32k "stat d32k/ext2-mountpoint/../d2/test_file ========"
  
  test_32l() {
         rm -fr $DIR/d32l
-       mkdir -p $DIR/d32l/ext2-mountpoint 
+       mkdir -p $DIR/d32l/ext2-mountpoint
         mount -t ext2 -o loop $EXT2_DEV $DIR/d32l/ext2-mountpoint || error
         mkdir -p $DIR/d32l/d2
         touch $DIR/d32l/d2/test_file
@@ -1327,10 +1350,10 @@ run_test 32l "open d32l/ext2-mountpoint/../d2/test_file ========"
  
  test_32m() {
         rm -fr $DIR/d32m
-       mkdir -p $DIR/d32m/tmp    
-       TMP_DIR=$DIR/d32m/tmp       
-       ln -s $DIR $TMP_DIR/symlink11 
-       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+       mkdir -p $DIR/d32m/tmp
+       TMP_DIR=$DIR/d32m/tmp
+       ln -s $DIR $TMP_DIR/symlink11
+       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01
         $CHECKSTAT -t link $DIR/d32m/tmp/symlink11 || error
         $CHECKSTAT -t link $DIR/d32m/symlink01 || error
  }
@@ -1338,10 +1361,10 @@ run_test 32m "stat d32m/symlink->tmp/symlink->lustre-root ======"
  
  test_32n() {
         rm -fr $DIR/d32n
-       mkdir -p $DIR/d32n/tmp    
-       TMP_DIR=$DIR/d32n/tmp       
-       ln -s $DIR $TMP_DIR/symlink11 
-       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+       mkdir -p $DIR/d32n/tmp
+       TMP_DIR=$DIR/d32n/tmp
+       ln -s $DIR $TMP_DIR/symlink11
+       ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01
         ls -l $DIR/d32n/tmp/symlink11  || error
         ls -l $DIR/d32n/symlink01 || error
  }
@@ -1349,11 +1372,11 @@ run_test 32n "open d32n/symlink->tmp/symlink->lustre-root ======"
  
  test_32o() {
         rm -fr $DIR/d32o $DIR/$tfile
-       touch $DIR/$tfile 
-       mkdir -p $DIR/d32o/tmp    
-       TMP_DIR=$DIR/d32o/tmp       
-       ln -s $DIR/$tfile $TMP_DIR/symlink12 
-       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+       touch $DIR/$tfile
+       mkdir -p $DIR/d32o/tmp
+       TMP_DIR=$DIR/d32o/tmp
+       ln -s $DIR/$tfile $TMP_DIR/symlink12
+       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02
         $CHECKSTAT -t link $DIR/d32o/tmp/symlink12 || error
         $CHECKSTAT -t link $DIR/d32o/symlink02 || error
         $CHECKSTAT -t file -f $DIR/d32o/tmp/symlink12 || error
@@ -1367,15 +1390,15 @@ test_32p() {
      log 32p_2
         rm -f $DIR/$tfile
      log 32p_3
-       touch $DIR/$tfile 
+       touch $DIR/$tfile
      log 32p_4
-       mkdir -p $DIR/d32p/tmp    
+       mkdir -p $DIR/d32p/tmp
      log 32p_5
-       TMP_DIR=$DIR/d32p/tmp       
+       TMP_DIR=$DIR/d32p/tmp
      log 32p_6
-       ln -s $DIR/$tfile $TMP_DIR/symlink12 
+       ln -s $DIR/$tfile $TMP_DIR/symlink12
      log 32p_7
-       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+       ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02
      log 32p_8
         cat $DIR/d32p/tmp/symlink12 || error
      log 32p_9
@@ -1454,7 +1477,7 @@ test_34b() {
  run_test 34b "O_RDONLY opening file doesn't create objects ====="
  
  test_34c() {
-       [ ! -f $DIR/f34 ] && test_34a 
+       [ ! -f $DIR/f34 ] && test_34a
         $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error
         $OPENFILE -f O_RDWR $DIR/f34
         $GETSTRIPE $DIR/f34 2>&1 | grep -q "no stripe info" && error
@@ -1463,7 +1486,7 @@ test_34c() {
  run_test 34c "O_RDWR opening file-with-size works =============="
  
  test_34d() {
-       [ ! -f $DIR/f34 ] && test_34a 
+       [ ! -f $DIR/f34 ] && test_34a
         dd if=/dev/zero of=$DIR/f34 conv=notrunc bs=4k count=1 || error
         $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error
         rm $DIR/f34
@@ -1564,10 +1587,12 @@ test_36f() {
  run_test 36f "utime on file racing with OST BRW write =========="
  
  test_36g() {
-       remote_ost && skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
+
+       mkdir -p $DIR/$tdir
         export FMD_MAX_AGE=`do_facet ost1 lctl get_param -n obdfilter.*.client_cache_seconds 2> /dev/null | head -n 1`
         FMD_BEFORE="`awk '/ll_fmd_cache/ { print $2 }' /proc/slabinfo`"
-       touch $DIR/d36/$tfile
+       touch $DIR/$tdir/$tfile
         sleep $((FMD_MAX_AGE + 12))
         FMD_AFTER="`awk '/ll_fmd_cache/ { print $2 }' /proc/slabinfo`"
         [ "$FMD_AFTER" -gt "$FMD_BEFORE" ] && \
@@ -1611,6 +1636,78 @@ test_39() {
  }
  run_test 39 "mtime changed on create ==========================="
  
+function get_times() {
+        FILE=$1
+        TIME=$2
+
+        i=0
+        for time in `stat -c "%X %Y %Z" $FILE`; do
+                eval "$TIME[$i]=$time"
+                i=$(($i + 1))
+        done
+}
+
+test_39b() {
+        ATIME=0
+        MTIME=1
+        CTIME=2
+        mkdir -p $DIR/$tdir
+        cp -p /etc/passwd $DIR/$tdir/fopen
+        cp -p /etc/passwd $DIR/$tdir/flink
+        cp -p /etc/passwd $DIR/$tdir/funlink
+        cp -p /etc/passwd $DIR/$tdir/frename
+        ln $DIR/$tdir/funlink $DIR/$tdir/funlink2
+
+        get_times $DIR/$tdir/fopen OPEN_OLD
+        get_times $DIR/$tdir/flink LINK_OLD
+        get_times $DIR/$tdir/funlink UNLINK_OLD
+        get_times $DIR/$tdir/frename RENAME_OLD
+
+        sleep 1
+        echo "aaaaaa" >> $DIR/$tdir/fopen
+        echo "aaaaaa" >> $DIR/$tdir/flink
+        echo "aaaaaa" >> $DIR/$tdir/funlink
+        echo "aaaaaa" >> $DIR/$tdir/frename
+
+        get_times $DIR/$tdir/fopen OPEN_NEW
+        get_times $DIR/$tdir/flink LINK_NEW
+        get_times $DIR/$tdir/funlink UNLINK_NEW
+        get_times $DIR/$tdir/frename RENAME_NEW
+
+        cat $DIR/$tdir/fopen > /dev/null
+        ln $DIR/$tdir/flink $DIR/$tdir/flink2
+        rm -f $DIR/$tdir/funlink2
+        mv -f $DIR/$tdir/frename $DIR/$tdir/frename2
+
+        get_times $DIR/$tdir/fopen OPEN_NEW2
+        get_times $DIR/$tdir/flink LINK_NEW2
+        get_times $DIR/$tdir/funlink UNLINK_NEW2
+        get_times $DIR/$tdir/frename2 RENAME_NEW2
+        echo ${OPEN_OLD[1]},${OPEN_NEW[$MTIME]},${OPEN_NEW2[$MTIME]}
+        echo ${LINK_OLD[1]},${LINK_NEW[$MTIME]},${LINK_NEW2[$MTIME]}
+        echo ${UNLINK_OLD[1]},${UNLINK_NEW[$MTIME]},${UNLINK_NEW2[$MTIME]}
+        echo ${RENAME_OLD[1]},${RENAME_NEW[$MTIME]},${RENAME_NEW2[$MTIME]}
+
+        [ ${OPEN_NEW2[$MTIME]} -eq ${OPEN_NEW[$MTIME]} ] || error "open file reverses mtime"
+        [ ${LINK_NEW2[$MTIME]} -eq ${LINK_NEW[$MTIME]} ] || error "link file reverses mtime"
+        [ ${UNLINK_NEW2[$MTIME]} -eq ${UNLINK_NEW[$MTIME]} ] || error "unlink file reverses mtime"
+        [ ${RENAME_NEW2[$MTIME]} -eq ${RENAME_NEW[$MTIME]} ] || error "rename file reverses mtime"
+}
+run_test 39b "mtime change on close ============================"
+
+# bug 11063
+test_39c() {
+        touch -m -d "10 years ago" $DIR1/$tfile
+        local MTIME1=`stat -c %y $DIR1/$tfile`
+        echo hello >> $DIR1/$tfile
+        local MTIME2=`stat -c %y $DIR1/$tfile`
+        mv $DIR1/$tfile $DIR1/$tfile-1
+        local MTIME3=`stat -c %y $DIR1/$tfile-1`
+        [ "$MTIME2" = "$MTIME3" ] ||
+                error "mtime ($MTIME2) changed (to $MTIME3) on rename (BZ#11063)"
+}
+run_test 39c "mtime change on rename ==========================="
+
  test_40() {
         dd if=/dev/zero of=$DIR/f40 bs=4096 count=1
         $RUNAS $OPENFILE -f O_WRONLY:O_TRUNC $DIR/f40 && error
@@ -1898,7 +1995,7 @@ page_size() {
  
  # in a 2 stripe file (lov.sh), page 1023 maps to page 511 in its object.  this
  # test tickles a bug where re-dirtying a page was failing to be mapped to the
-# objects offset and an assert hit when an rpc was built with 1023's mapped 
+# objects offset and an assert hit when an rpc was built with 1023's mapped
  # offset 511 and 511's raw 511 offset. it also found general redirtying bugs.
  test_46() {
         f="$DIR/f46"
@@ -2094,7 +2191,7 @@ test_51d() {
                 error "OST $N has less objects vs OST $NLAST (${OBJS[$N]} < ${OBJS[$NLAST]}"
             [ ${OBJS[$N]} -gt $((${OBJS[$NLAST]} + 20)) ] && \
                 error "OST $N has less objects vs OST $NLAST (${OBJS[$N]} < ${OBJS[$NLAST]}"
-           
+
             [ ${OBJS0[$N]} -lt $((${OBJS0[$NLAST]} - 20)) ] && \
                 error "OST $N has less #0 objects vs OST $NLAST (${OBJS0[$N]} < ${OBJS0[$NLAST]}"
             [ ${OBJS0[$N]} -gt $((${OBJS0[$NLAST]} + 20)) ] && \
@@ -2159,13 +2256,21 @@ test_52c() { # 12848 simulating client < 1.4.7
  run_test 52c "immutable flag test for client < 1.4.7 ======="
  
  test_53() {
-       remote_mds && skip "remote MDS" && return
-       
-       for VALUE in `lctl get_param osc.*-osc.prealloc_last_id`; do
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
+
+       local param
+       local ostname
+       local mds_last
+       local ost_last
+       local ostnum
+
+       for VALUE in $(do_facet mds lctl get_param osc.*-osc.prealloc_last_id); do
                 param=`echo ${VALUE[0]} | cut -d "=" -f1`;
                 ostname=`echo $param | cut -d "." -f2 | cut -d - -f 1-2`
-               mds_last=`lctl get_param -n $param`
-               ost_last=`lctl get_param -n obdfilter.$ostname.last_id`
+               mds_last=$(do_facet mds lctl get_param -n $param)
+               ostnum=$(echo $ostname | sed "s/${FSNAME}-OST//g" | awk '{print ($1+1)}' )
+               ost_last=$(do_facet ost$ostnum lctl get_param -n obdfilter.$ostname.last_id)
                 echo "$ostname.last_id=$ost_last ; MDS.last_id=$mds_last"
                 if [ $ost_last != $mds_last ]; then
                         error "$ostname.last_id=$ost_last ; MDS.last_id=$mds_last"
@@ -2187,7 +2292,7 @@ test_54b() {
         f="$DIR/f54b"
         mknod $f c 1 3
         chmod 0666 $f
-       dd if=/dev/zero of=$f bs=`page_size` count=1 
+       dd if=/dev/zero of=$f bs=`page_size` count=1
  }
  run_test 54b "char device works in lustre ======================"
  
@@ -2209,7 +2314,7 @@ test_54c() {
         tdir="$DIR/d54c"
         loopdev="$DIR/loop54c"
  
-       find_loop_dev 
+       find_loop_dev
         [ -z "$LOOPNUM" ] && echo "couldn't find empty loop device" && return
         mknod $loopdev b 7 $LOOPNUM
         echo "make a loop file system with $tfile on $loopdev ($LOOPNUM)..."
@@ -2360,7 +2465,7 @@ setup_56_special() {
  }
  
  test_56g() {
-        $LSTRIPE -d $DIR
+        $SETSTRIPE -d $DIR
  
          setup_56 $NUMFILES $NUMDIRS
  
@@ -2376,7 +2481,7 @@ test_56g() {
  run_test 56g "check lfs find -name ============================="
  
  test_56h() {
-        $LSTRIPE -d $DIR
+        $SETSTRIPE -d $DIR
  
          setup_56 $NUMFILES $NUMDIRS
  
@@ -2514,12 +2619,13 @@ test_56q() {
  run_test 56q "check lfs find -gid and ! -gid ==============================="
  
  test_57a() {
-       remote_mds && skip "remote MDS" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+
         local MNTDEV="mds.*.mntdev"
-       DEV=$(lctl get_param -n $MNTDEV)
-       [ -z "$DEV" ] && error "can't access $MNTDEV" 
-       for DEV in `lctl get_param -n $MNTDEV`; do
-               dumpe2fs -h $DEV > $TMP/t57a.dump || error "can't access $DEV"
+       DEV=$(do_facet mds lctl get_param -n $MNTDEV)
+       [ -z "$DEV" ] && error "can't access $MNTDEV"
+       for DEV in $(do_facet mds lctl get_param -n $MNTDEV); do
+               do_facet mds dumpe2fs -h $DEV > $TMP/t57a.dump || error "can't access $DEV"
                 DEVISIZE=`awk '/Inode size:/ { print $3 }' $TMP/t57a.dump`
                 [ "$DEVISIZE" -gt 128 ] || error "inode size $DEVISIZE"
                 rm $TMP/t57a.dump
@@ -2611,7 +2717,7 @@ test_60b() { # bug 6411
  run_test 60b "limit repeated messages from CERROR/CWARN ========"
  
  test_60c() {
-       echo "create 5000 files" 
+       echo "create 5000 files"
         createmany -o $DIR/f60c- 5000
         #define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x137
         lctl set_param fail_loc=0x80000137
@@ -2754,7 +2860,7 @@ test_65e() {
         touch $DIR/d65/f6
         $LVERIFY $DIR/d65 $DIR/d65/f6 || error "lverify failed"
  }
-run_test 65e "directory setstripe 0 -1 0 ======================="
+run_test 65e "directory setstripe defaults ======================="
  
  test_65f() {
         mkdir -p $DIR/d65f
@@ -2779,9 +2885,9 @@ test_65h() {
            "`$GETSTRIPE -v $DIR/d65/dd1 | grep "^count"`" ] || error "stripe info inherit failed"
  }
  run_test 65h "directory stripe info inherit ===================="
- 
+
  test_65i() { # bug6367
-        $SETSTRIPE $MOUNT -s 65536 -c -1 
+        $SETSTRIPE $MOUNT -s 65536 -c -1
  }
  run_test 65i "set non-default striping on root directory (bug 6367)="
  
@@ -2798,7 +2904,7 @@ run_test 65j "set default striping on root directory (bug 6367)="
  
  test_65k() { # bug11679
          [ "$OSTCOUNT" -lt 2 ] && skip "too few OSTs" && return
-        remote_mds_nodsh && skip "remote MDS" && return
+        remote_mds_nodsh && skip "remote MDS with nodsh" && return
  
          echo "Check OST status: "
          MDS_OSCS=`do_facet mds lctl dl | awk '/[oO][sS][cC].*md[ts]/ { print $4 }'`
@@ -2851,7 +2957,9 @@ test_67a() { # was test_67 bug 3285 - supplementary group fails on MDS, passes o
         chgrp $RUNAS_ID $DIR/$tdir
         $RUNAS -u $RUNAS_ID -g $(($RUNAS_ID + 1)) -G1,2,$RUNAS_ID ls $DIR/$tdir
         RC=$?
-       GROUP_UPCALL=`lctl get_param -n mds.*.group_upcall`
+       GROUP_UPCALL=$(do_facet mds lctl get_param -n mds.*.group_upcall)
+       [ -z "$GROUP_UPCALL" ] && \
+               skip "lctl get_param failed! Useless to continue the test!" && return
         [ "$GROUP_UPCALL" = "NONE" -a $RC -eq 0 ] && \
                 error "no-upcall passed" || true
         [ "$GROUP_UPCALL" != "NONE" -a $RC -ne 0 ] && \
@@ -2862,23 +2970,32 @@ run_test 67a "supplementary group failure (should return error) ="
  cleanup_67b() {
         set +vx
         trap 0
-       lctl set_param -n mds.$MDS.group_upcall NONE
+       do_facet mds lctl set_param -n mds.*.group_upcall NONE
  }
  
  test_67b() { # bug 3285 - supplementary group fails on MDS, passes on client
-       T67_UID=${T67_UID:-1}   # needs to be in /etc/groups on MDS, gid == uid
+       # needs to be in /etc/groups on MDS, gid == uid
+       # Let's use RUNAS_ID
+       T67_UID=${T67_UID:-$RUNAS_ID}
+       
         [ "$UID" = "$T67_UID" ] && skip "UID = T67_UID = $UID -- skipping" && return
         check_kernel_version 35 || return 0
-       remote_mds && skip "remote MDS" && return
-       GROUP_UPCALL=`lctl get_param -n mds.$MDS.group_upcall`
-       [ "$GROUP_UPCALL" != "NONE" ] && skip "skip test - upcall" &&return
+       do_facet mds grep -q ":$T67_UID:$T67_UID" /etc/passwd || \
+               { skip "Need gid=$T67_UID group and gid == uid on mds !" && return; }
+
+       GROUP_UPCALL=$(do_facet mds lctl get_param -n mds.*.group_upcall)
+       [ -z "$GROUP_UPCALL" ] && \
+               skip "lctl get_param failed! Useless to continue the test!" && return
+       [ "$GROUP_UPCALL" != "NONE" ] && \
+               skip "skip test - upcall=$GROUP_UPCALL" && return
         set -vx
         trap cleanup_67b EXIT
         mkdir -p $DIR/$tdir
         chmod 771 $DIR/$tdir
         chgrp $T67_UID $DIR/$tdir
-       lctl set_param -n mds.$MDS.group_upcall `which l_getgroups`
-       l_getgroups -d $T67_UID
+       local l_getgroups=$(do_facet mds which l_getgroups)
+       do_facet mds lctl set_param -n mds.*.group_upcall $l_getgroups
+       do_facet mds $l_getgroups -d $T67_UID
         $RUNAS -u $T67_UID -g 999 -G8,9,$T67_UID touch $DIR/$tdir/$tfile || \
                 error "'touch $DIR/$tdir/$tfile' failed"
         [ -f $DIR/$tdir/$tfile ] || error "$DIR/$tdir/$tfile create error"
@@ -2948,56 +3065,35 @@ run_test 68 "support swapping to Lustre ========================"
  # bug5265, obdfilter oa2dentry return -ENOENT
  # #define OBD_FAIL_OST_ENOENT 0x217
  test_69() {
-       [ $(lctl get_param -n devices |  grep -c obdfilter) -eq 0 ] &&
-               skip "skipping test for remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         f="$DIR/$tfile"
-       touch $f
+       $SETSTRIPE $f -c 1 -i 0
  
         $DIRECTIO write ${f}.2 0 1 || error "directio write error"
  
         #define OBD_FAIL_OST_ENOENT 0x217
-       lctl set_param fail_loc=0x217
+       do_facet ost1 lctl set_param fail_loc=0x217
         truncate $f 1 # vmtruncate() will ignore truncate() error.
         $DIRECTIO write $f 0 2 && error "write succeeded, expect -ENOENT"
  
-       lctl set_param fail_loc=0
+       do_facet ost1 lctl set_param fail_loc=0
         $DIRECTIO write $f 0 2 || error "write error"
  
         cancel_lru_locks osc
         $DIRECTIO read $f 0 1 || error "read error"
  
         #define OBD_FAIL_OST_ENOENT 0x217
-       lctl set_param fail_loc=0x217
+       do_facet ost1 lctl set_param fail_loc=0x217
         $DIRECTIO read $f 1 1 && error "read succeeded, expect -ENOENT"
  
-       lctl set_param fail_loc=0
+       do_facet ost1 lctl set_param fail_loc=0
         rm -f $f
  }
  run_test 69 "verify oa2dentry return -ENOENT doesn't LBUG ======"
  
  test_71() {
-       which dbench > /dev/null 2>&1 || { skip "dbench not installed, skip this test" && return 0; }
-       DBENCH_LIB=${DBENCH_LIB:-/usr/lib/dbench}
-       PATH=${DBENCH_LIB}:${PATH}
-       cp `which dbench` $DIR
-
-       TGT=$DIR/client.txt
-       SRC=${SRC:-$DBENCH_LIB/client.txt}
-       [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-       SRC=$DBENCH_LIB/client_plain.txt
-       [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-       echo "copying necessary libs to $DIR"
-       LIBS71=$(ldd $DIR/dbench|sed -e 's/\t*//' -e 's/.*=> //' -e 's/ .*//' -e 's/^\///')
-       (cd / && tar chf - $LIBS71) | (cd $DIR && tar xvf -)
-       [ $? = 0 ] || error "can't copy libs $LIBS71 to $DIR"
-       echo "chroot $DIR /dbench -c client.txt 2"
-       chroot $DIR /dbench -c client.txt 2
-       RC=$?
-
-       rm -rf $DIR/dbench $DIR/lib $DIR/lib64
-
-       return $RC
+    sh rundbench -C -D $DIR 2 || error "dbench failed!"
  }
  run_test 71 "Running dbench on lustre (don't segment fault) ===="
  
@@ -3006,7 +3102,7 @@ test_72() { # bug 5695 - Test that on 2.6 remove_suid works properly
         [ "$RUNAS_ID" = "$UID" ] && skip "RUNAS_ID = UID = $UID -- skipping" && return
  
          # Check that testing environment is properly set up. Skip if not
-        FAIL_ON_ERROR=false check_runas_id_ret $RUNAS_ID $RUNAS || {
+        FAIL_ON_ERROR=false check_runas_id_ret $RUNAS_ID $RUNAS_ID $RUNAS || {
                  skip "User $RUNAS_ID does not exist - skipping"
                  return 0
          }
@@ -3025,7 +3121,7 @@ run_test 72 "Test that remove suid works properly (bug5695) ===="
  
  # bug 3462 - multiple simultaneous MDC requests
  test_73() {
-       mkdir $DIR/d73-1 
+       mkdir $DIR/d73-1
         mkdir $DIR/d73-2
         multiop_bg_pause $DIR/d73-1/f73-1 O_c || return 1
         pid1=$!
@@ -3044,9 +3140,9 @@ test_73() {
  
         sleep 25
  
-       $CHECKSTAT -t file $DIR/d73-1/f73-1 || return 4
-       $CHECKSTAT -t file $DIR/d73-1/f73-2 || return 5 
-       $CHECKSTAT -t file $DIR/d73-2/f73-3 || return 6 
+       $CHECKSTAT -t file $DIR/d73-1/f73-1 || error "$DIR/d73-1/f73-1 not file"
+       $CHECKSTAT -t file $DIR/d73-1/f73-2 || error "$DIR/d73-1/f73-2 not file"
+       $CHECKSTAT -t file $DIR/d73-2/f73-3 || error "$DIR/d73-2/f73-3 not file"
  
         rm -rf $DIR/d73-*
  }
@@ -3088,22 +3184,22 @@ export T75_PREP=no
  test75_prep() {
          [ $T75_PREP = "yes" ] && return
          echo "using F75=$F75, F128k=$F128k, FHEAD=$FHEAD, FTAIL=$FTAIL"
- 
+
          dd if=/dev/urandom of=${F75}_128k bs=128k count=1 || error "dd failed"
          log "finished dd"
          chmod 777 ${F128k}
          T75_PREP=yes
  }
- 
+
  test_75a() {
          test75_prep
- 
+
          cp -p ${F128k} ${FHEAD}
          log "finished cp to $FHEAD"
          cp -p ${F128k} ${FTAIL}
          log "finished cp to $FTAIL"
          cat ${F128k} ${F128k} > ${F75}_sim_sim
- 
+
          $JOIN ${FHEAD} ${FTAIL} || error "join ${FHEAD} ${FTAIL} error"
          log "finished join $FHEAD to ${F75}_sim_sim"
          cmp ${FHEAD} ${F75}_sim_sim || error "${FHEAD} ${F75}_sim_sim differ"
@@ -3111,10 +3207,10 @@ test_75a() {
          $CHECKSTAT -a ${FTAIL} || error "tail ${FTAIL} still exist after join"
  }
  run_test 75a "TEST join file ===================================="
- 
+
  test_75b() {
          test75_prep
- 
+
          cp -p ${F128k} ${FTAIL}
          cat ${F75}_sim_sim >> ${F75}_join_sim
          cat ${F128k} >> ${F75}_join_sim
@@ -3124,10 +3220,10 @@ test_75b() {
          $CHECKSTAT -a ${FTAIL} || error "tail ${FTAIL} exist after join"
  }
  run_test 75b "TEST join file 2 =================================="
- 
+
  test_75c() {
          test75_prep
- 
+
          cp -p ${F128k} ${FTAIL}
          cat ${F128k} >> ${F75}_sim_join
          cat ${F75}_join_sim >> ${F75}_sim_join
@@ -3137,10 +3233,10 @@ test_75c() {
          $CHECKSTAT -a ${FHEAD} || error "tail ${FHEAD} exist after join"
  }
  run_test 75c "TEST join file 3 =================================="
- 
+
  test_75d() {
          test75_prep
- 
+
          cp -p ${F128k} ${FHEAD}
          cp -p ${F128k} ${FHEAD}_tmp
          cat ${F75}_sim_sim >> ${F75}_join_join
@@ -3151,17 +3247,17 @@ test_75d() {
          $CHECKSTAT -a ${FTAIL} || error "tail ${FTAIL} exist after join (2)"
  }
  run_test 75d "TEST join file 4 =================================="
- 
+
  test_75e() {
          test75_prep
- 
+
          rm -rf ${FHEAD} || "delete join file error"
  }
  run_test 75e "TEST join file 5 (remove joined file) ============="
- 
+
  test_75f() {
          test75_prep
- 
+
          cp -p ${F128k} ${F75}_join_10_compare
          cp -p ${F128k} ${F75}_join_10
          for ((i = 0; i < 10; i++)); do
@@ -3175,13 +3271,13 @@ test_75f() {
                  error "files ${F75}_join_10 ${F75}_join_10_compare differ"
  }
  run_test 75f "TEST join file 6 (join 10 files) =================="
- 
+
  test_75g() {
          [ ! -f ${F75}_join_10 ] && echo "${F75}_join_10 missing" && return
          $LFS getstripe ${F75}_join_10
- 
+
          $OPENUNLINK ${F75}_join_10 ${F75}_join_10 || error "files unlink open"
- 
+
          ls -l $F75*
  }
  run_test 75g "TEST join file 7 (open unlink) ===================="
@@ -3214,9 +3310,8 @@ set_checksums()
  {
         [ "$ORIG_CSUM" ] || ORIG_CSUM=`lctl get_param -n osc.*.checksums |
                                        head -n1`
-       for f in $LPROC/osc/*/checksums; do
-               echo $1 >> $f
-       done
+
+       lctl set_param -n osc.*.checksums=$1
         return 0
  }
  
@@ -3259,7 +3354,7 @@ test_77b() { # bug 10889
  run_test 77b "checksum error on client write ===================="
  
  test_77c() { # bug 10889
-       [ ! -f $DIR/f77b ] && skip "requires 77b - skipping" && return  
+       [ ! -f $DIR/f77b ] && skip "requires 77b - skipping" && return
         set_checksums 1
         for algo in $CKSUM_TYPES; do
                 cancel_lru_locks osc
@@ -3286,7 +3381,7 @@ test_77d() { # bug 10889
  run_test 77d "checksum error on OST direct write ==============="
  
  test_77e() { # bug 10889
-       [ ! -f $DIR/f77 ] && skip "requires 77d - skipping" && return  
+       [ ! -f $DIR/f77 ] && skip "requires 77d - skipping" && return
         #define OBD_FAIL_OSC_CHECKSUM_RECEIVE    0x408
         lctl set_param fail_loc=0x80000408
         set_checksums 1
@@ -3315,29 +3410,31 @@ test_77f() { # bug 10889
  run_test 77f "repeat checksum error on write (expect error) ===="
  
  test_77g() { # bug 10889
-       [ $(lctl get_param -n devices | grep -c obdfilter) -eq 0 ] && \
-               skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
+
         [ ! -f $F77_TMP ] && setup_f77
+
+       $SETSTRIPE $DIR/f77g -c 1 -i 0
         #define OBD_FAIL_OST_CHECKSUM_RECEIVE       0x21a
-       lctl set_param fail_loc=0x8000021a
+       do_facet ost1 lctl set_param fail_loc=0x8000021a
         set_checksums 1
         dd if=$F77_TMP of=$DIR/f77g bs=1M count=$F77SZ || \
                 error "write error: rc=$?"
-       lctl set_param fail_loc=0
+       do_facet ost1 lctl set_param fail_loc=0
         set_checksums 0
  }
  run_test 77g "checksum error on OST write ======================"
  
  test_77h() { # bug 10889
-       [ $(lctl get_param -n devices | grep -c obdfilter) -eq 0 ] && \
-               skip "remote OST" && return
-       [ ! -f $DIR/f77g ] && skip "requires 77g - skipping" && return  
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
+
+       [ ! -f $DIR/f77g ] && skip "requires 77g - skipping" && return
         cancel_lru_locks osc
         #define OBD_FAIL_OST_CHECKSUM_SEND          0x21b
-       lctl set_param fail_loc=0x8000021b
+       do_facet ost1 lctl set_param fail_loc=0x8000021b
         set_checksums 1
         cmp $F77_TMP $DIR/f77g || error "file compare failed"
-       lctl set_param fail_loc=0
+       do_facet ost1 lctl set_param fail_loc=0
         set_checksums 0
  }
  run_test 77h "checksum error on OST read ======================="
@@ -3409,17 +3506,11 @@ test_78() { # bug 10901
  run_test 78 "handle large O_DIRECT writes correctly ============"
  
  test_79() { # bug 12743
-       [ $(lctl get_param -n devices | grep -c obdfilter) -eq 0 ] &&
-               skip "skipping test for remote OST" && return
-
         wait_delete_completed
  
-        BKTOTAL=`lctl get_param -n obdfilter.*.kbytestotal |
-                 awk 'BEGIN{total=0}; {total+=$1}; END{print total}'`
-        BKFREE=`lctl get_param -n obdfilter.*.kbytesfree |
-                awk 'BEGIN{free=0}; {free+=$1}; END{print free}'`
-        BKAVAIL=`lctl get_param -n obdfilter.*.kbytesavail |
-                 awk 'BEGIN{avail=0}; {avail+=$1}; END{print avail}'`
+        BKTOTAL=$(calc_osc_kbytes kbytestotal)
+        BKFREE=$(calc_osc_kbytes kbytesfree)
+        BKAVAIL=$(calc_osc_kbytes kbytesavail)
          STRING=`df -P $MOUNT | tail -n 1 | awk '{print $2","$3","$4}'`
          DFTOTAL=`echo $STRING | cut -d, -f1`
          DFUSED=`echo $STRING  | cut -d, -f2`
@@ -3428,15 +3519,15 @@ test_79() { # bug 12743
  
          ALLOWANCE=$((64 * $OSTCOUNT))
  
-        if [ $DFTOTAL -lt $(($BKTOTAL - $ALLOWANCE)) ] ||  
+        if [ $DFTOTAL -lt $(($BKTOTAL - $ALLOWANCE)) ] ||
             [ $DFTOTAL -gt $(($BKTOTAL + $ALLOWANCE)) ] ; then
                  error "df total($DFTOTAL) mismatch OST total($BKTOTAL)"
          fi
-        if [ $DFFREE -lt $(($BKFREE - $ALLOWANCE)) ] || 
+        if [ $DFFREE -lt $(($BKFREE - $ALLOWANCE)) ] ||
             [ $DFFREE -gt $(($BKFREE + $ALLOWANCE)) ] ; then
                  error "df free($DFFREE) mismatch OST free($BKFREE)"
          fi
-        if [ $DFAVAIL -lt $(($BKAVAIL - $ALLOWANCE)) ] || 
+        if [ $DFAVAIL -lt $(($BKAVAIL - $ALLOWANCE)) ] ||
             [ $DFAVAIL -gt $(($BKAVAIL + $ALLOWANCE)) ] ; then
                  error "df avail($DFAVAIL) mismatch OST avail($BKAVAIL)"
          fi
@@ -3447,7 +3538,7 @@ test_80() { # bug 10718
          dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 seek=1M
          sync; sleep 1; sync
          BEFORE=`date +%s`
-        cancel_lru_locks OSC
+        cancel_lru_locks osc
          AFTER=`date +%s`
          DIFF=$((AFTER-BEFORE))
          if [ $DIFF -gt 1 ] ; then
@@ -3457,25 +3548,20 @@ test_80() { # bug 10718
  }
  run_test 80 "Page eviction is equally fast at high offsets too  ===="
  
-# on the LLNL clusters, runas will still pick up root's $TMP settings,
-# which will not be writable for the runas user, and then you get a CVS
-# error message with a corrupt path string (CVS bug) and panic.
-# We're not using much space, so just stick it in /tmp, which is safe.
-OLDTMPDIR=$TMPDIR
-OLDTMP=$TMP
-TMPDIR=/tmp
-TMP=/tmp
-OLDHOME=$HOME
-[ $RUNAS_ID -ne $UID ] && HOME=/tmp
-
  test_99a() {
+       [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
         mkdir -p $DIR/d99cvsroot || error "mkdir $DIR/d99cvsroot failed"
         chown $RUNAS_ID $DIR/d99cvsroot || error "chown $DIR/d99cvsroot failed"
+       local oldPWD=$PWD       # bug 13584, use $TMP as working dir
+       cd $TMP
+       
         $RUNAS cvs -d $DIR/d99cvsroot init || error "cvs init failed"
+       cd $oldPWD
  }
  run_test 99a "cvs init ========================================="
  
  test_99b() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
         [ ! -d $DIR/d99cvsroot ] && test_99a
         $RUNAS [ ! -w /tmp ] && skip "/tmp has wrong w permission -- skipping" && return
         cd /etc/init.d || error "cd /etc/init.d failed"
@@ -3489,6 +3575,7 @@ test_99b() {
  run_test 99b "cvs import ======================================="
  
  test_99c() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
         [ ! -d $DIR/d99cvsroot ] && test_99b
         cd $DIR || error "cd $DIR failed"
         mkdir -p $DIR/d99reposname || error "mkdir $DIR/d99reposname failed"
@@ -3500,6 +3587,7 @@ test_99c() {
  run_test 99c "cvs checkout ====================================="
  
  test_99d() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
         [ ! -d $DIR/d99cvsroot ] && test_99c
         cd $DIR/d99reposname
         $RUNAS touch foo99
@@ -3508,6 +3596,7 @@ test_99d() {
  run_test 99d "cvs add =========================================="
  
  test_99e() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
         [ ! -d $DIR/d99cvsroot ] && test_99c
         cd $DIR/d99reposname
         $RUNAS cvs update
@@ -3515,6 +3604,7 @@ test_99e() {
  run_test 99e "cvs update ======================================="
  
  test_99f() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
         [ ! -d $DIR/d99cvsroot ] && test_99d
         cd $DIR/d99reposname
         $RUNAS cvs commit -m 'nomsg' foo99
@@ -3522,10 +3612,21 @@ test_99f() {
  run_test 99f "cvs commit ======================================="
  
  test_100() {
-       netstat -tna | while read PROT SND RCV LOCAL REMOTE STAT; do
+       [ "$NETTYPE" = tcp ] || \
+               { skip "TCP secure port test, not useful for NETTYPE=$NETTYPE" && \
+                       return ; }
+
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       remote_servers || \
+               { skip "useless for local single node setup" && return; }
+
+       netstat -tna | ( rc=1; while read PROT SND RCV LOCAL REMOTE STAT; do
                 [ "$PROT" != "tcp" ] && continue
-               RPORT=`echo $REMOTE | cut -d: -f2`
+               RPORT=$(echo $REMOTE | cut -d: -f2)
                 [ "$RPORT" != "$ACCEPTOR_PORT" ] && continue
+
+               rc=0
                 LPORT=`echo $LOCAL | cut -d: -f2`
                 if [ $LPORT -ge 1024 ]; then
                         echo "bad: $PROT $SND $RCV $LOCAL $REMOTE $STAT"
@@ -3533,7 +3634,7 @@ test_100() {
                         error "local: $LPORT > 1024, remote: $RPORT"
                 fi
         done
-       true
+       [ "$rc" = 0 ] || error "privileged port not found" )
  }
  run_test 100 "check local port using privileged port ==========="
  
@@ -3579,89 +3680,88 @@ test_101() {
         $READS -f $DIR/$tfile -s$((cache_limit * 3192 * 1024)) -b65536 -C -n$nreads -t 180
  
         discard=0
-       for s in `lctl get_param -n llite.*.read_ahead_stats | get_named_value 'read but discarded'`; do
-               discard=$(($discard + $s))
+        for s in `lctl get_param -n llite.*.read_ahead_stats | \
+               get_named_value 'read but discarded' | cut -d" " -f1`; do
+                       discard=$(($discard + $s))
         done
         cleanup_101
  
         if [ $(($discard * 10)) -gt $nreads ] ;then
                 lctl get_param osc.*.rpc_stats
                 lctl get_param llite.*.read_ahead_stats
-               error "too many ($discard) discarded pages" 
+               error "too many ($discard) discarded pages"
         fi
         rm -f $DIR/$tfile || true
  }
  run_test 101 "check read-ahead for random reads ================"
  
-export SETUP_TEST101=no
-setup_test101() {
-       [ "$SETUP_TEST101" = "yes" ] && return
+export SETUP_TEST101b=no
+setup_101b() {
+       [ "$SETUP_TEST101b" = "yes" ] && return
         mkdir -p $DIR/$tdir
         STRIPE_SIZE=1048576
         STRIPE_COUNT=$OSTCOUNT
         STRIPE_OFFSET=0
  
-       trap cleanup_test101 EXIT
+       trap cleanup_101b EXIT
         # prepare the read-ahead file
         $SETSTRIPE $DIR/$tfile -s $STRIPE_SIZE -i $STRIPE_OFFSET -c $OSTCOUNT
  
         dd if=/dev/zero of=$DIR/$tfile bs=1024k count=100 2> /dev/null
-       SETUP_TEST102=yes
+       SETUP_TEST101b=yes
  }
  
-cleanup_test101() {
-       [ "$SETUP_TEST101" = "yes" ] || return
+cleanup_101b() {
         trap 0
-       rm -rf $DIR/$tdir
-       SETUP_TEST102=no
+       rm -rf $DIR/$tdir $DIR/$tfile
+       SETUP_TEST101b=no
  }
  
  calc_total() {
         awk 'BEGIN{total=0}; {total+=$1}; END{print total}'
  }
  
-ra_check_101() {
+ra_check_101b() {
         local READ_SIZE=$1
         local STRIPE_SIZE=1048576
         local RA_INC=1048576
         local STRIDE_LENGTH=$((STRIPE_SIZE/READ_SIZE))
         local FILE_LENGTH=$((64*100))
-       local discard_limit=$(((((((STRIDE_LENGTH - 1))*3)/(STRIDE_LENGTH*OSTCOUNT))* \
-                            (STRIDE_LENGTH*OSTCOUNT - STRIDE_LENGTH))))
-
-       DISCARD=`$LCTL get_param -n llite.*.read_ahead_stats |   \
-                        get_named_value 'read but discarded' | calc_total`
+       local discard_limit=$((((STRIDE_LENGTH - 1)*3/(STRIDE_LENGTH*OSTCOUNT))* \
+                            (STRIDE_LENGTH*OSTCOUNT - STRIDE_LENGTH)))
+       DISCARD=`$LCTL get_param -n llite.*.read_ahead_stats | \
+                       get_named_value 'read but discarded' | \
+                       cut -d" " -f1 | calc_total`
  
         if [ $DISCARD -gt $discard_limit ]; then
                 lctl get_param llite.*.read_ahead_stats
-               error "Too many ($DISCARD) discarded with size (${READ_SIZE})"
+               error "Too many ($DISCARD) discarded pages (size ${READ_SIZE})"
         else
                 echo "Read-ahead success for size ${READ_SIZE}"
         fi
  }
  
  test_101b() {
-       [ "$OSTCOUNT" -lt "2" ] && skip "skipping stride IO stride-ahead test" && return
+       [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs for stride-read" && return
         local STRIPE_SIZE=1048576
         local STRIDE_SIZE=$((STRIPE_SIZE*OSTCOUNT))
         local FILE_LENGTH=$((STRIPE_SIZE*100))
         local ITERATION=$((FILE_LENGTH/STRIDE_SIZE))
         # prepare the read-ahead file
-       setup_test101
-       cancel_lru_locks osc 
-       for BIDX in 2 4 8 16 32 64 128 256
-       do
+       setup_101b
+       cancel_lru_locks osc
+       for BIDX in 2 4 8 16 32 64 128 256; do
                 local BSIZE=$((BIDX*4096))
                 local READ_COUNT=$((STRIPE_SIZE/BSIZE))
                 local STRIDE_LENGTH=$((STRIDE_SIZE/BSIZE))
                 local OFFSET=$((STRIPE_SIZE/BSIZE*(OSTCOUNT - 1)))
                 $LCTL set_param -n llite.*.read_ahead_stats 0
                 $READS -f $DIR/$tfile  -l $STRIDE_LENGTH -o $OFFSET \
-                             -s $FILE_LENGTH -b $STRIPE_SIZE -a $READ_COUNT -n $ITERATION
+                       -s $FILE_LENGTH -b $STRIPE_SIZE -a $READ_COUNT -n $ITERATION
                 cancel_lru_locks osc
-               ra_check_101 $BSIZE
+               ra_check_101b $BSIZE
         done
-       cleanup_test101
+       cleanup_101b
         true
  }
  run_test 101b "check stride-io mode read-ahead ================="
@@ -3671,18 +3771,18 @@ setup_test102() {
         [ "$SETUP_TEST102" = "yes" ] && return
         mkdir -p $DIR/$tdir
         STRIPE_SIZE=65536
-       STRIPE_COUNT=4 
+       STRIPE_COUNT=4
         STRIPE_OFFSET=2
  
         trap cleanup_test102 EXIT
         cd $DIR
         $SETSTRIPE $tdir -s $STRIPE_SIZE -i $STRIPE_OFFSET -c $STRIPE_COUNT
-       cd $DIR/$tdir 
+       cd $DIR/$tdir
         for num in 1 2 3 4
         do
                 for count in 1 2 3 4
                 do
-                       for offset in 0 1 2 3 
+                       for offset in 0 1 2 3
                         do
                                 local stripe_size=`expr $STRIPE_SIZE \* $num`
                                 local file=file"$num-$offset-$count"
@@ -3692,7 +3792,7 @@ setup_test102() {
         done
  
         cd $DIR
-       star -c  f=$TMP/f102.tar $tdir 
+       star -c  f=$TMP/f102.tar $tdir
         SETUP_TEST102=yes
  }
  
@@ -3711,7 +3811,7 @@ test_102a() {
          touch $testfile
  
         [ "$UID" != 0 ] && skip "must run as root" && return
-       [ -z "`lctl get_param -n mdc.*[mM][dD][cC]*.connect_flags | grep xattr`" ] &&
+       [ -z "`lctl get_param -n mdc.*.connect_flags | grep xattr`" ] &&
         skip "must have user_xattr" && return
         [ -z "$(which setfattr 2>/dev/null)" ] && skip "could not find setfattr" && return
  
@@ -3719,7 +3819,7 @@ test_102a() {
          setfattr -n trusted.name1 -v value1 $testfile || error
          [ "`getfattr -n trusted.name1 $testfile 2> /dev/null | \
          grep "trusted.name1"`" == "trusted.name1=\"value1\"" ] || error
- 
+
          setfattr -n user.author1 -v author1 $testfile || error
          [ "`getfattr -n user.author1 $testfile 2> /dev/null | \
          grep "user.author1"`" == "user.author1=\"author1\"" ] || error
@@ -3730,7 +3830,7 @@ test_102a() {
          [ `getfattr -d -m "^trusted" $testfile 2> /dev/null | \
          grep "trusted.name" | wc -l` -eq 3 ] || error
  
- 
+
          setfattr -n user.author2 -v author2 $testfile || error
          setfattr -n user.author3 -v author3 $testfile || error
          [ `getfattr -d -m "^user" $testfile 2> /dev/null | \
@@ -3803,42 +3903,25 @@ test_102c() {
  }
  run_test 102c "non-root getfattr/setfattr for lustre.lov EAs ==========="
  
-get_stripe_info() {
-       stripe_size=0
-       stripe_count=0
-       stripe_offset=0
-       local lines=`sed -n '/obdidx/=' $1`
-       stripe_size=`awk '{if($1~/size/) print $2}' $1`
-       stripe_count=`awk '{if($1~/count/) print $2}' $1`
-       lines=`expr $lines + 1`
-       stripe_offset=`sed -n ${lines}p $1 |awk '{print $1}'`
-}
-
  compare_stripe_info1() {
         for num in 1 2 3 4
         do
                 for count in 1 2 3 4
                 do
-                       for offset in 0 1 2 3 
+                       for offset in 0 1 2 3
                         do
                                 local size=`expr $STRIPE_SIZE \* $num`
                                 local file=file"$num-$offset-$count"
-                               local tmp_file=out
-                               $GETSTRIPE -v $file > $tmp_file 
-                               get_stripe_info  $tmp_file
-                               if test $stripe_size -ne $size
-                               then
+                               get_stripe_info client $PWD/$file
+                               if [ $stripe_size -ne $size ]; then
                                         error "$file: different stripe size" && return
                                 fi
-                               if test $stripe_count -ne $count
-                               then
+                               if [ $stripe_count -ne $count ]; then
                                         error "$file: different stripe count" && return
                                 fi
-                               if test $stripe_offset -ne 0
-                               then
+                               if [ $stripe_index -ne 0 ]; then
                                         error "$file: different stripe offset" && return
                                 fi
-                               rm -f $tmp_file
                         done
                 done
         done
@@ -3849,26 +3932,20 @@ compare_stripe_info2() {
         do
                 for count in 1 2 3 4
                 do
-                       for offset in 0 1 2 3 
+                       for offset in 0 1 2 3
                         do
                                 local size=`expr $STRIPE_SIZE \* $num`
                                 local file=file"$num-$offset-$count"
-                               local tmp_file=out
-                               $GETSTRIPE -v $file > $tmp_file
-                               get_stripe_info  $tmp_file
-                               if test $stripe_size -ne $size
-                               then
+                               get_stripe_info client $PWD/$file
+                               if [ $stripe_size -ne $size ]; then
                                         error "$file: different stripe size" && return  
                                 fi
-                               if test $stripe_count -ne $count
-                               then
+                               if [ $stripe_count -ne $count ]; then
                                         error "$file: different stripe count" && return
                                 fi
-                               if test $stripe_offset -ne $offset
-                               then
+                               if [ $stripe_index -ne $offset ]; then
                                         error "$file: different stripe offset" && return
                                 fi
-                               rm -f $tmp_file
                         done
                 done
         done
@@ -3876,7 +3953,7 @@ compare_stripe_info2() {
  
  test_102d() {
         # b10930: star test for trusted.lov xattr
-       star --xhelp 2>&1 | grep -q nolustre  
+       star --xhelp 2>&1 | grep -q nolustre
         if [ $? -ne 0 ]
         then
                 skip "being skipped because a lustre-aware star is not installed." && return
@@ -3893,7 +3970,7 @@ run_test 102d "star restore stripe info from tarfile,not keep osts ==========="
  
  test_102e() {
         # b10930: star test for trusted.lov xattr
-       star --xhelp 2>&1 | grep -q nolustre  
+       star --xhelp 2>&1 | grep -q nolustre
         [ $? -ne 0 ] && skip "lustre-aware star is not installed" && return
         [ "$OSTCOUNT" -lt "4" ] && skip "skipping 4-stripe test" && return
         setup_test102
@@ -3906,7 +3983,7 @@ run_test 102e "star restore stripe info from tarfile, keep osts ==========="
  
  test_102f() {
         # b10930: star test for trusted.lov xattr
-       star --xhelp 2>&1 | grep -q nolustre  
+       star --xhelp 2>&1 | grep -q nolustre
         [ $? -ne 0 ] && skip "lustre-aware star is not installed" && return
         [ "$OSTCOUNT" -lt "4" ] && skip "skipping 4-stripe test" && return
         setup_test102
@@ -3920,7 +3997,7 @@ run_test 102f "star copy files, not keep osts ==========="
  
  test_102g() {
         # b10930: star test for trusted.lov xattr
-       star --xhelp 2>&1 | grep -q nolustre  
+       star --xhelp 2>&1 | grep -q nolustre
         [ $? -ne 0 ] && skip "lustre-aware star is not installed" && return
         [ "$OSTCOUNT" -lt "4" ] && skip "skipping 4-stripe test" && return
         setup_test102
@@ -3977,6 +4054,15 @@ test_102h() { # bug 15777
  }
  run_test 102h "grow xattr from inside inode to external block"
  
+test_102i() { # bug 17038
+        touch $DIR/$tfile
+        ln -s $DIR/$tfile $DIR/${tfile}link
+        getfattr -n trusted.lov $DIR/$tfile || error "lgetxattr on $DIR/$tfile failed"
+        getfattr -h -n trusted.lov $DIR/${tfile}link 2>&1 | grep -i "no such attr" || error "error for lgetxattr on $DIR/${tfile}link is not ENODATA"
+        rm -f $DIR/$tfile $DIR/${tfile}link
+}
+run_test 102i "lgetxattr test on symbolic link ============"
+
  run_acl_subtest()
  {
      $LUSTRE/tests/acl/run $LUSTRE/tests/acl/$1.test
@@ -3985,7 +4071,7 @@ run_acl_subtest()
  
  test_103 () {
      [ "$UID" != 0 ] && skip "must run as root" && return
-    [ -z "$(lctl get_param mdc.*[mM][dD][cC]*.connect_flags | grep acl)" ] && skip "must have acl enabled" && return
+    [ -z "$(lctl get_param mdc.*.connect_flags | grep acl)" ] && skip "must have acl enabled" && return
      [ -z "$(which setfacl 2>/dev/null)" ] && skip "could not find setfacl" && return
  
      SAVE_UMASK=`umask`
@@ -4038,9 +4124,9 @@ test_105a() {
          touch $DIR/$tfile
          if [ -n "`mount | grep \"$DIR.*flock\" | grep -v noflock`" ];
          then
-                flocks_test on -f $DIR/$tfile || error "fail flock on"
+                flocks_test 1 on -f $DIR/$tfile || error "fail flock on"
          else
-                flocks_test off -f $DIR/$tfile || error "fail flock off"
+                flocks_test 1 off -f $DIR/$tfile || error "fail flock off"
          fi
  }
  run_test 105a "flock when mounted without -o flock test ========"
@@ -4049,9 +4135,9 @@ test_105b() {
          touch $DIR/$tfile
          if [ -n "`mount | grep \"$DIR.*flock\" | grep -v noflock`" ];
          then
-                flocks_test on -c $DIR/$tfile || error "fail flock on"
+                flocks_test 1 on -c $DIR/$tfile || error "fail flock on"
          else
-                flocks_test off -c $DIR/$tfile || error "fail flock off"
+                flocks_test 1 off -c $DIR/$tfile || error "fail flock off"
          fi
  }
  run_test 105b "fcntl when mounted without -o flock test ========"
@@ -4060,13 +4146,23 @@ test_105c() {
          touch $DIR/$tfile
          if [ -n "`mount | grep \"$DIR.*flock\" | grep -v noflock`" ];
          then
-                flocks_test on -l $DIR/$tfile || error "fail flock on"
+                flocks_test 1 on -l $DIR/$tfile || error "fail flock on"
          else
-                flocks_test off -l $DIR/$tfile || error "fail flock off"
+                flocks_test 1 off -l $DIR/$tfile || error "fail flock off"
          fi
  }
  run_test 105c "lockf when mounted without -o flock test ========"
  
+test_105d() { # bug 15924
+        mkdir -p $DIR/$tdir
+        [ -z "`mount | grep \"$DIR.*flock\" | grep -v noflock`" ] && \
+                skip "mount w/o flock enabled" && return
+        #define OBD_FAIL_LDLM_CP_CB_WAIT  0x315
+        $LCTL set_param fail_loc=0x80000315
+        flocks_test 2 $DIR/$tdir
+}
+run_test 105d "flock race (should not freeze) ========"
+
  test_106() { #bug 10921
         mkdir -p $DIR/$tdir
         $DIR/$tdir && error "exec $DIR/$tdir succeeded"
@@ -4077,14 +4173,19 @@ run_test 106 "attempt exec of dir followed by chown of that dir"
  test_107() {
          CDIR=`pwd`
          cd $DIR
+
+        local file=core
+        rm -f $file
+
+        local save_pattern=$(sysctl -n kernel.core_pattern)
+        local save_uses_pid=$(sysctl -n kernel.core_uses_pid)
+        sysctl -w kernel.core_pattern=$file
+        sysctl -w kernel.core_uses_pid=0
+
          ulimit -c unlimited
          sleep 60 &
          SLEEPPID=$!
  
-        file=`sysctl -n kernel.core_pattern`
-        core_pid=`sysctl -n kernel.core_uses_pid`
-        [ $core_pid -eq 1 ] && file=$file.$SLEEPPID
-        rm -f $file
          sleep 1
  
          kill -s 11 $SLEEPPID
@@ -4096,6 +4197,8 @@ test_107() {
                  error "Fail to create core file $file"
          fi
          rm -f $file
+        sysctl -w kernel.core_pattern=$save_pattern
+        sysctl -w kernel.core_uses_pid=$save_uses_pid
          cd $CDIR
  }
  run_test 107 "Coredump on SIG"
@@ -4120,7 +4223,7 @@ test_115() {
  
         # don't return an error
          [ $OSTIO_post -eq $OSTIO_pre ] && echo \
-           "FAIL: No addition ll_ost_io threads were created ($OSTIO_pre)" &&\
+           "WARNING: No new ll_ost_io threads were created ($OSTIO_pre)" &&\
             echo "This may be fine, depending on what ran before this test" &&\
             echo "and how fast this system is." && return
  
@@ -4143,13 +4246,12 @@ free_min_max () {
                 MINV=${AVAIL[i]}; MINI=$i
             fi
         done
-       echo Min free space: OST $MINI: $MINV 
-       echo Max free space: OST $MAXI: $MAXV 
+       echo Min free space: OST $MINI: $MINV
+       echo Max free space: OST $MAXI: $MAXV
  }
  
  test_116() {
         [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return
-       remote_mds && skip "remote MDS" && return
  
         echo -n "Free space priority "
         lctl get_param -n lov.*.qos_prio_free
@@ -4184,7 +4286,7 @@ test_116() {
             echo "ok"
         else
             echo "failed - QOS mode won't be used"
-           error "QOS imbalance criteria not met"
+           error_ignore "QOS imbalance criteria not met"
             return
         fi
  
@@ -4209,14 +4311,14 @@ test_116() {
         free_min_max
         DIFF2=$(($MAXV - $MINV))
         echo "free space delta: orig $DIFF final $DIFF2"
-       [ $DIFF2 -gt $DIFF ] && echo "delta got worse!" 
+       [ $DIFF2 -gt $DIFF ] && echo "delta got worse!"
         DIFF=$(($MINV1 - ${AVAIL[$MINI1]}))
         echo "Wrote $DIFF to smaller OST $MINI1"
         DIFF2=$(($MAXV1 - ${AVAIL[$MAXI1]}))
         echo "Wrote $DIFF2 to larger OST $MAXI1"
         [ $DIFF -gt 0 ] && echo "Wrote $(($DIFF2 * 100 / $DIFF - 100))% more data to larger OST $MAXI1"
  
-       # Figure out which files were written where 
+       # Figure out which files were written where
         UUID=$(lctl get_param -n lov.${FSNAME}-clilov-*.target_obd | awk '/'$MINI1': / {print $2; exit}')
         echo $UUID
          MINC=$($GETSTRIPE --obd $UUID $DIR/$tdir | wc -l)
@@ -4225,7 +4327,7 @@ test_116() {
          MAXC=$($GETSTRIPE --obd $UUID $DIR/$tdir | wc -l)
         echo "$MAXC files created on larger OST $MAXI1"
         [ $MINC -gt 0 ] && echo "Wrote $(($MAXC * 100 / $MINC - 100))% more files to larger OST $MAXI1"
-       [ $MAXC -gt $MINC ] || error "stripe QOS didn't balance free space"
+       [ $MAXC -gt $MINC ] || error_ignore "stripe QOS didn't balance free space"
  }
  run_test 116 "stripe QOS: free space balance ==================="
  
@@ -4255,7 +4357,7 @@ reset_async() {
         FILE=$DIR/reset_async
  
         # Ensure all OSCs are cleared
-       $LSTRIPE $FILE 0 -1 -1
+       $SETSTRIPE $FILE 0 -1 -1
          dd if=/dev/zero of=$FILE bs=64k count=$OSTCOUNT
         sync
          rm $FILE
@@ -4278,7 +4380,7 @@ run_test 118a "verify O_SYNC works =========="
  
  test_118b()
  {
-       remote_ost_nodsh && skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_async
  
@@ -4313,7 +4415,7 @@ run_test 118b "Reclaim dirty pages on fatal error =========="
  
  test_118c()
  {
-       remote_ost_nodsh && skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_async
  
@@ -4355,7 +4457,7 @@ run_test 118c "Fsync blocks on EROFS until dirty pages are flushed =========="
  
  test_118d()
  {
-       remote_ost_nodsh && skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         reset_async
  
@@ -4404,7 +4506,7 @@ test_118f() {
         fi
         
          lctl set_param fail_loc=0x0
-        
+
          LOCKED=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c locked)
          DIRTY=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c dirty)
          WRITEBACK=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c writeback)
@@ -4459,7 +4561,7 @@ test_118g() {
  run_test 118g "Don't stay in wait if we got local -ENOMEM  =========="
  
  test_118h() {
-       remote_ost_nodsh && skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
          reset_async
  
@@ -4493,7 +4595,7 @@ test_118h() {
  run_test 118h "Verify timeout in handling recoverables errors  =========="
  
  test_118i() {
-       remote_ost_nodsh && skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
          reset_async
  
@@ -4531,7 +4633,7 @@ test_118i() {
  run_test 118i "Fix error before timeout in recoverable error  =========="
  
  test_118j() {
-       remote_ost_nodsh && skip "remote OST" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
          reset_async
  
@@ -4567,6 +4669,8 @@ run_test 118j "Simulate unrecoverable OST side error =========="
  
  test_118k()
  {
+       remote_ost_nodsh && skip "remote OSTs with nodsh" && return
+
         #define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
         set_nodes_failloc "$(osts_nodes)" 0x20e
         mkdir -p $DIR/$tdir
@@ -4659,7 +4763,7 @@ test_120a() {
          stat $DIR/$tdir > /dev/null
          can1=`lctl get_param -n ldlm.services.ldlm_canceld.stats | awk '/ldlm_cancel/ {print $2}'`
          blk1=`lctl get_param -n ldlm.services.ldlm_cbd.stats | awk '/ldlm_bl_callback/ {print $2}'`
-        mkdir -p $DIR/$tdir/d1
+        mkdir $DIR/$tdir/d1
          can2=`lctl get_param -n ldlm.services.ldlm_canceld.stats | awk '/ldlm_cancel/ {print $2}'`
          blk2=`lctl get_param -n ldlm.services.ldlm_cbd.stats | awk '/ldlm_bl_callback/ {print $2}'`
          [ $can1 -eq $can2 ] || error $((can2-can1)) "cancel RPC occured."
@@ -4792,7 +4896,7 @@ test_120g() {
          cancel_lru_locks mdc
          cancel_lru_locks osc
          t0=`date +%s`
-        
+
          can0=`lctl get_param -n ldlm.services.ldlm_canceld.stats | awk '/ldlm_cancel/ {print $2}'`
          blk0=`lctl get_param -n ldlm.services.ldlm_cbd.stats | awk '/ldlm_bl_callback/ {print $2}'`
          createmany -o $DIR/$tdir/f $count
@@ -4836,44 +4940,55 @@ test_122() { #bug 11544
  run_test 122 "fail client bulk callback (shouldn't LBUG) ======="
  
  test_123a() { # was test 123, statahead(bug 11401)
+        SLOWOK=0
          if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
                  log "testing on UP system. Performance may be not as good as expected."
+               SLOWOK=1
          fi
  
-        remount_client $MOUNT
+        rm -rf $DIR/$tdir
          mkdir -p $DIR/$tdir
-        error=0
          NUMFREE=`df -i -P $DIR | tail -n 1 | awk '{ print $4 }'`
          [ $NUMFREE -gt 100000 ] && NUMFREE=100000 || NUMFREE=$((NUMFREE-1000))
          MULT=10
-        for ((i=1, j=0; i<=$NUMFREE; j=$i, i=$((i * MULT)) )); do
+        for ((i=100, j=0; i<=$NUMFREE; j=$i, i=$((i * MULT)) )); do
                  createmany -o $DIR/$tdir/$tfile $j $((i - j))
  
-                lctl get_param -n llite.*.statahead_max | grep '[0-9]'
-                cancel_lru_locks mdc
-                cancel_lru_locks osc
-                stime=`date +%s`
-                ls -l $DIR/$tdir > /dev/null
-                etime=`date +%s`
-                delta_sa=$((etime - stime))
-                log "ls $i files with statahead:    $delta_sa sec"
-               lctl get_param -n llite.*.statahead_stats
-
                  max=`lctl get_param -n llite.*.statahead_max | head -n 1`
                  lctl set_param -n llite.*.statahead_max 0
                  lctl get_param llite.*.statahead_max
                  cancel_lru_locks mdc
                  cancel_lru_locks osc
                  stime=`date +%s`
-                ls -l $DIR/$tdir > /dev/null
+                time ls -l $DIR/$tdir > /dev/null
                  etime=`date +%s`
                  delta=$((etime - stime))
                  log "ls $i files without statahead: $delta sec"
-
                  lctl set_param llite.*.statahead_max=$max
-                if [ $delta_sa -gt $(($delta + 2)) ]; then
-                        log "ls $i files is slower with statahead!"
-                        error=1
+
+                swrong=`lctl get_param -n llite.*.statahead_stats | grep "statahead wrong:" | awk '{print $3}'`
+                lctl get_param -n llite.*.statahead_max | grep '[0-9]'
+                cancel_lru_locks mdc
+                cancel_lru_locks osc
+                stime=`date +%s`
+                time ls -l $DIR/$tdir > /dev/null
+                etime=`date +%s`
+                delta_sa=$((etime - stime))
+                log "ls $i files with statahead:    $delta_sa sec"
+               lctl get_param -n llite.*.statahead_stats
+                ewrong=`lctl get_param -n llite.*.statahead_stats | grep "statahead wrong:" | awk '{print $3}'`
+
+                if [ $swrong -lt $ewrong ]; then
+                        log "statahead was stopped, maybe too many locks held!"
+                fi
+
+                if [ $((delta_sa * 100)) -gt $((delta * 105)) ]; then
+                        if [  $SLOWOK -eq 0 ]; then
+                                error "ls $i files is slower with statahead!"
+                        else
+                                log "ls $i files is slower with statahead!"
+                        fi
+                        break;
                  fi
  
                  [ $delta -gt 20 ] && break
@@ -4890,10 +5005,6 @@ test_123a() { # was test 123, statahead(bug 11401)
          log "rm -r $DIR/$tdir/: $delta seconds"
          log "rm done"
          lctl get_param -n llite.*.statahead_stats
-        # wait for commitment of removal
-        sleep 2
-        [ $error -ne 0 ] && error "statahead is slow!"
-        return 0
  }
  run_test 123a "verify statahead work"
  
@@ -4919,25 +5030,24 @@ run_test 123b "not panic with network error in statahead enqueue (bug 15027)"
  test_124a() {
         [ -z "`lctl get_param -n mdc.*.connect_flags | grep lru_resize`" ] && \
                 skip "no lru resize on server" && return 0
-        NR=2000
+        local NR=2000
          mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir"
  
-        # use touch to produce $NR new locks
          log "create $NR files at $DIR/$tdir"
-        createmany -o $DIR/$tdir/f $NR || 
-                error "failed to create $NR files in $DIR/$tdir" 
-        
+        createmany -o $DIR/$tdir/f $NR ||
+                error "failed to create $NR files in $DIR/$tdir"
+
          cancel_lru_locks mdc
          ls -l $DIR/$tdir > /dev/null
  
-        NSDIR=""
-        LRU_SIZE=0
+        local NSDIR=""
+        local LRU_SIZE=0
          for VALUE in `lctl get_param ldlm.namespaces.*mdc-*.lru_size`; do
-                PARAM=`echo ${VALUE[0]} | cut -d "=" -f1`
+                local PARAM=`echo ${VALUE[0]} | cut -d "=" -f1`
                  LRU_SIZE=$(lctl get_param -n $PARAM)
                  if [ $LRU_SIZE -gt $(default_lru_size) ]; then
                          NSDIR=$(echo $PARAM | cut -d "." -f1-3)
-                        log "using $(basename $NSDIR) namespace"
+                        log "NS=$(basename $NSDIR)"
                          break
                  fi
          done
@@ -4946,40 +5056,53 @@ test_124a() {
                  skip "Not enough cached locks created!"
                  return 0
          fi
-        log "created $LRU_SIZE lock(s)"
-
-        # we want to sleep 30s to not make test too long
-        SLEEP=30
-        SLEEP_ADD=2
-
-        # we know that lru resize allows one client to hold $LIMIT locks for 10h
-        MAX_HRS=10
-
-        # get the pool limit
-        LIMIT=`lctl get_param -n $NSDIR.pool.limit`
-
-        # calculate lock volume factor taking into account data set size and the
-        # rule that number of locks will be getting smaller durring sleep interval
-        # and we need to additionally enforce LVF to take this into account.
-        # Use $LRU_SIZE_B here to take into account real number of locks created
-        # in the case of CMD, LRU_SIZE_B != $NR in most of cases
-        LVF=$(($MAX_HRS * 60 * 60 * $LIMIT / $SLEEP))
-        LRU_SIZE_B=$LRU_SIZE
-        log "make client drop locks $LVF times faster so that ${SLEEP}s is enough to cancel $LRU_SIZE lock(s)"
-        OLD_LVF=`lctl get_param -n $NSDIR.pool.lock_volume_factor`
+        log "LRU=$LRU_SIZE"
+
+        local SLEEP=30
+
+        # We know that lru resize allows one client to hold $LIMIT locks
+        # for 10h. After that locks begin to be killed by client.
+        local MAX_HRS=10
+        local LIMIT=`lctl get_param -n $NSDIR.pool.limit`
+
+        # Make LVF so higher that sleeping for $SLEEP is enough to _start_
+        # killing locks. Some time was spent for creating locks. This means
+        # that up to the moment of sleep finish we must have killed some of
+        # them (10-100 locks). This depends on how fast ther were created.
+        # Many of them were touched in almost the same moment and thus will
+        # be killed in groups.
+        local LVF=$(($MAX_HRS * 60 * 60 / $SLEEP * $LIMIT / $LRU_SIZE))
+
+        # Use $LRU_SIZE_B here to take into account real number of locks
+        # created in the case of CMD, LRU_SIZE_B != $NR in most of cases
+        local LRU_SIZE_B=$LRU_SIZE
+        log "LVF=$LVF"
+        local OLD_LVF=`lctl get_param -n $NSDIR.pool.lock_volume_factor`
          lctl set_param -n $NSDIR.pool.lock_volume_factor $LVF
-        log "sleep for $((SLEEP+SLEEP_ADD))s"
-        sleep $((SLEEP+SLEEP_ADD))
+        
+        # Let's make sure that we really have some margin. Client checks
+        # cached locks every 10 sec.
+        SLEEP=$((SLEEP+20))
+        log "Sleep ${SLEEP} sec"
+        local SEC=0
+        while ((SEC<$SLEEP)); do
+                echo -n "..."
+                sleep 5
+                SEC=$((SEC+5))
+                LRU_SIZE=`lctl get_param -n $NSDIR/lru_size`
+                echo -n "$LRU_SIZE"
+        done
+        echo ""
          lctl set_param -n $NSDIR.pool.lock_volume_factor $OLD_LVF
-        LRU_SIZE_A=`lctl get_param -n $NSDIR/lru_size`
+        local LRU_SIZE_A=`lctl get_param -n $NSDIR/lru_size`
  
          [ $LRU_SIZE_B -gt $LRU_SIZE_A ] || {
-                error "No locks dropped in "$((SLEEP+SLEEP_ADD))"s. LRU size: $LRU_SIZE_A"
+                error "No locks dropped in ${SLEEP}s. LRU size: $LRU_SIZE_A"
                  unlinkmany $DIR/$tdir/f $NR
                  return
          }
  
-        log "Dropped "$((LRU_SIZE_B-LRU_SIZE_A))" locks in "$((SLEEP+SLEEP_ADD))"s"
+        log "Dropped "$((LRU_SIZE_B-LRU_SIZE_A))" locks in ${SLEEP}s"
          log "unlink $NR files at $DIR/$tdir"
          unlinkmany $DIR/$tdir/f $NR
  }
@@ -4989,7 +5112,7 @@ test_124b() {
         [ -z "`lctl get_param -n mdc.*.connect_flags | grep lru_resize`" ] && \
                 skip "no lru resize on server" && return 0
  
-        # even for cmd no matter what metadata namespace to use for getting 
+        # even for cmd no matter what metadata namespace to use for getting
          # the limit, we use appropriate.
          LIMIT=`lctl get_param -n ldlm.namespaces.*mdc*.pool.limit`
  
@@ -4998,7 +5121,7 @@ test_124b() {
                  NR=$LIMIT
          fi
          lru_resize_disable mdc
-        mkdir -p $DIR/$tdir/disable_lru_resize || 
+        mkdir -p $DIR/$tdir/disable_lru_resize ||
                 error "failed to create $DIR/$tdir/disable_lru_resize"
  
          createmany -o $DIR/$tdir/disable_lru_resize/f $NR
@@ -5022,7 +5145,7 @@ test_124b() {
          unlinkmany $DIR/$tdir/disable_lru_resize/f $NR
  
          lru_resize_enable mdc
-        mkdir -p $DIR/$tdir/enable_lru_resize || 
+        mkdir -p $DIR/$tdir/enable_lru_resize ||
                 error "failed to create $DIR/$tdir/enable_lru_resize"
  
          createmany -o $DIR/$tdir/enable_lru_resize/f $NR
@@ -5074,7 +5197,7 @@ test_126() { # bug 12829/13455
  run_test 126 "check that the fsgid provided by the client is taken into account"
  
  test_127() { # bug 15521
-        $LSTRIPE -i 0 -c 1 $DIR/$tfile
+        $SETSTRIPE -i 0 -c 1 $DIR/$tfile
          $LCTL set_param osc.*.stats=0
          FSIZE=$((2048 * 1024))
          dd if=/dev/zero of=$DIR/$tfile bs=$FSIZE count=1
@@ -5083,19 +5206,20 @@ test_127() { # bug 15521
  
          $LCTL get_param osc.*0000-osc-*.stats | grep samples > $DIR/${tfile}.tmp
          while read NAME COUNT SAMP UNIT MIN MAX SUM SUMSQ; do
-                eval $NAME=$COUNT
                  echo "got $COUNT $NAME"
-
+                [ ! $MIN ] && error "Missing min value for $NAME proc entry"
+                eval $NAME=$COUNT || error "Wrong proc format"
+               
                  case $NAME in
-                        ost_read|ost_write)
+                        read_bytes|write_bytes)
                          [ $MIN -lt 4096 ] && error "min is too small: $MIN"
                          [ $MIN -gt $FSIZE ] && error "min is too big: $MIN"
                          [ $MAX -lt 4096 ] && error "max is too small: $MAX"
                          [ $MAX -gt $FSIZE ] && error "max is too big: $MAX"
                          [ $SUM -ne $FSIZE ] && error "sum is wrong: $SUM"
-                        [ $SUMSQ -lt $(((FSIZE /4096) * (4096 * 4096))) ] && 
+                        [ $SUMSQ -lt $(((FSIZE /4096) * (4096 * 4096))) ] &&
                                  error "sumsquare is too small: $SUMSQ"
-                        [ $SUMSQ -gt $((FSIZE * FSIZE)) ] && 
+                        [ $SUMSQ -gt $((FSIZE * FSIZE)) ] &&
                                  error "sumsquare is too big: $SUMSQ"
                          ;;
                          *) ;;
@@ -5103,8 +5227,10 @@ test_127() { # bug 15521
          done < $DIR/${tfile}.tmp
  
          #check that we actually got some stats
-        [ "$ost_read" ] || error "no read done"
-        [ "$ost_write" ] || error "no write done"
+        [ "$read_bytes" ] || error "Missing read_bytes stats"
+        [ "$write_bytes" ] || error "Missing write_bytes stats"
+        [ "$read_bytes" != 0 ] || error "no read done"
+        [ "$write_bytes" != 0 ] || error "no write done"
  }
  run_test 127 "verify the client stats are sane"
  
@@ -5117,15 +5243,16 @@ test_128() { # bug 15212
  
         result=$(grep error $TMP/$tfile.log)
         rm -f $DIR/$tfile
-       [ -z "$result" ] || error "consecutive find's under interactive lfs failed"
+       [ -z "$result" ] || error "consecutive find with interactive lfs failed"
  }
-run_test 128 "interactive lfs for 2 consecutive find's"
+run_test 128 "interactive lfs for 2 consecutive finds"
  
  test_129() {
          [ "$FSTYPE" != "ldiskfs" ] && skip "not needed for FSTYPE=$FSTYPE" && return 0
+        remote_mds_nodsh && skip "remote MDS with nodsh" && return
  
          DEV=$(basename $(do_facet mds lctl get_param -n mds.*.mntdev))
-        [ -z "$DEV" ] && error "can't access mds mntdev" 
+        [ -z "$DEV" ] && error "can't access mds mntdev"
          EFBIG=27
          LDPROC=/proc/fs/ldiskfs/$DEV/max_dir_size
          MAX=16384
@@ -5145,7 +5272,7 @@ test_129() {
                          return 0
                  elif [ $rc -ne 0 ]; then
                          do_facet mds "echo 0 >$LDPROC"
-                        error_exit "return code $rc received instead of expected $EFBIG"
+                        error_exit "error $rc instead of expected $EFBIG"
                  fi
                  J=$((J+1))
                  I=$(stat -c%s "$DIR/$tdir")
@@ -5156,9 +5283,570 @@ test_129() {
  }
  run_test 129 "test directory size limit ========================"
  
-TMPDIR=$OLDTMPDIR
-TMP=$OLDTMP
-HOME=$OLDHOME
+OLDIFS="$IFS"
+cleanup_130() {
+       trap 0
+       IFS="$OLDIFS"
+}
+
+test_130a() {
+       filefrag_op=$(filefrag -e 2>&1 | grep "invalid option")
+       [ -n "$filefrag_op" ] && skip "filefrag has no FIEMAP support" && return
+
+       trap cleanup_130 EXIT RETURN
+
+       local fm_file=$DIR/$tfile
+       lfs setstripe -s 65536 -c 1 $fm_file ||
+               error "setstripe failed on $fm_file"
+       dd if=/dev/zero of=$fm_file bs=65536 count=1 ||
+               error "dd failed for $fm_file"
+
+       filefrag -ves $fm_file || error "filefrag $fm_file failed"
+       filefrag_op=`filefrag -ve $fm_file | grep -A 100 "ext:" | grep -v "ext:" | grep -v "found"`
+
+       lun=`$GETSTRIPE $fm_file  | grep -A 10 obdidx | awk '{print $1}' | grep -v "obdidx"`
+
+       start_blk=`echo $filefrag_op | cut -d: -f2 | cut -d. -f1`
+       IFS=$'\n'
+       tot_len=0
+       for line in $filefrag_op
+       do
+               frag_lun=`echo $line | cut -d: -f5`
+               ext_len=`echo $line | cut -d: -f4`
+               if (( $frag_lun != $lun )); then
+                       cleanup_130
+                       error "FIEMAP on 1-stripe file($fm_file) failed"
+                       return
+               fi
+               (( tot_len += ext_len ))
+       done
+
+       if (( lun != frag_lun || start_blk != 0 || tot_len != 64 )); then
+               cleanup_130
+               error "FIEMAP on 1-stripe file($fm_file) failed;"
+               return
+       fi
+
+       cleanup_130
+
+       echo "FIEMAP on single striped file succeeded"
+}
+run_test 130a "FIEMAP (1-stripe file)"
+
+test_130b() {
+       [ "$OSTCOUNT" -lt "2" ] && skip "FIEMAP on 2-stripe file test" && return
+
+       filefrag_op=$(filefrag -e 2>&1 | grep "invalid option")
+       [ -n "$filefrag_op" ] && skip "filefrag has no FIEMAP support" && return
+
+       trap cleanup_130 EXIT RETURN
+
+       local fm_file=$DIR/$tfile
+       lfs setstripe -s 65536 -c 2 $fm_file ||
+               error "setstripe failed on $fm_file"
+       dd if=/dev/zero of=$fm_file bs=1M count=2 ||
+               error "dd failed on $fm_file"
+
+       filefrag -ves $fm_file || error "filefrag $fm_file failed"
+       filefrag_op=`filefrag -ve $fm_file | grep -A 100 "ext:" | grep -v "ext:" | grep -v "found"`
+
+       last_lun=`echo $filefrag_op | cut -d: -f5`
+
+       IFS=$'\n'
+       tot_len=0
+       num_luns=1
+       for line in $filefrag_op; do
+               frag_lun=`echo $line | cut -d: -f5`
+               ext_len=`echo $line | cut -d: -f4`
+               if (( $frag_lun != $last_lun )); then
+                       if (( tot_len != 1024 )); then
+                               cleanup_130
+                               error "FIEMAP $fm_file: len $tot_len for OST $last_lun instead of 256"
+                               return
+                       else
+                               (( num_luns += 1 ))
+                               tot_len=0
+                       fi
+               fi
+               (( tot_len += ext_len ))
+               last_lun=$frag_lun
+       done
+       if (( num_luns != 2 || tot_len != 1024 )); then
+               cleanup_130
+               error "FIEMAP $fm_file: wrong number of LUNs or wrong len for OST $last_lun"
+               return
+       fi
+
+       cleanup_130
+
+       echo "FIEMAP on 2-stripe file succeeded"
+}
+run_test 130b "FIEMAP (2-stripe file)"
+
+test_130c() {
+       [ "$OSTCOUNT" -lt "2" ] && skip "FIEMAP on 2-stripe hole test" && return
+
+       filefrag_op=$(filefrag -e 2>&1 | grep "invalid option")
+       [ -n "$filefrag_op" ] && skip "filefrag has no FIEMAP support" && return
+
+       trap cleanup_130 EXIT RETURN
+
+       local fm_file=$DIR/$tfile
+       lfs setstripe -s 65536 -c 2 $fm_file ||
+               error "setstripe failed on $fm_file"
+       dd if=/dev/zero of=$fm_file seek=1 bs=1M count=1 ||
+               error "dd failed on $fm_file"
+
+       filefrag -ves $fm_file || error "filefrag $fm_file failed"
+       filefrag_op=`filefrag -ve $fm_file | grep -A 100 "ext:" | grep -v "ext:" | grep -v "found"`
+
+       last_lun=`echo $filefrag_op | cut -d: -f5`
+
+       IFS=$'\n'
+       tot_len=0
+       num_luns=1
+       for line in $filefrag_op
+       do
+               frag_lun=`echo $line | cut -d: -f5`
+               ext_len=`echo $line | cut -d: -f4`
+               if (( $frag_lun != $last_lun )); then
+                       logical=`echo $line | cut -d: -f2 | cut -d. -f1`
+                       if (( logical != 512 )); then
+                               cleanup_130
+                               error "FIEMAP $fm_file: logical start for LUN $logical instead of 512"
+                               return
+                       fi
+                       if (( tot_len != 512 )); then
+                               cleanup_130
+                               error "FIEMAP $fm_file: len $tot_len for OST $last_lun instead of 1024"
+                               return
+                       else
+                               (( num_luns += 1 ))
+                               tot_len=0
+                       fi
+               fi
+               (( tot_len += ext_len ))
+               last_lun=$frag_lun
+       done
+       if (( num_luns != 2 || tot_len != 512 )); then
+               cleanup_130
+               error "FIEMAP $fm_file: wrong number of LUNs or wrong len for OST $last_lun"
+               return
+       fi
+
+       cleanup_130
+
+       echo "FIEMAP on 2-stripe file with hole succeeded"
+}
+run_test 130c "FIEMAP (2-stripe file with hole)"
+
+test_130d() {
+       [ "$OSTCOUNT" -lt "3" ] && skip "FIEMAP on N-stripe file test" && return
+
+       filefrag_op=$(filefrag -e 2>&1 | grep "invalid option")
+       [ -n "$filefrag_op" ] && skip "filefrag has no FIEMAP support" && return
+
+       trap cleanup_130 EXIT RETURN
+
+       local fm_file=$DIR/$tfile
+       lfs setstripe -s 65536 -c $OSTCOUNT $fm_file ||
+               error "setstripe failed on $fm_file"
+       dd if=/dev/zero of=$fm_file bs=1M count=$OSTCOUNT ||
+               error "dd failed on $fm_file"
+
+       filefrag -ves $fm_file || error "filefrag $fm_file failed"
+       filefrag_op=`filefrag -ve $fm_file | grep -A 100 "ext:" | grep -v "ext:" | grep -v "found"`
+
+       last_lun=`echo $filefrag_op | cut -d: -f5`
+
+       IFS=$'\n'
+       tot_len=0
+       num_luns=1
+       for line in $filefrag_op
+       do
+               frag_lun=`echo $line | cut -d: -f5`
+               ext_len=`echo $line | cut -d: -f4`
+               if (( $frag_lun != $last_lun )); then
+                       if (( tot_len != 1024 )); then
+                               cleanup_130
+                               error "FIEMAP $fm_file: len $tot_len for OST $last_lun instead of 1024"
+                               return
+                       else
+                               (( num_luns += 1 ))
+                               tot_len=0
+                       fi
+               fi
+               (( tot_len += ext_len ))
+               last_lun=$frag_lun
+       done
+       if (( num_luns != OSTCOUNT || tot_len != 1024 )); then
+               cleanup_130
+               error "FIEMAP $fm_file: wrong number of LUNs or wrong len for OST $last_lun"
+               return
+       fi
+
+       cleanup_130
+
+       echo "FIEMAP on N-stripe file succeeded"
+}
+run_test 130d "FIEMAP (N-stripe file)"
+
+test_130e() {
+       [ "$OSTCOUNT" -lt "2" ] && skip "continuation FIEMAP test" && return
+
+       filefrag_op=$(filefrag -e 2>&1 | grep "invalid option")
+       [ -n "$filefrag_op" ] && skip "filefrag has no FIEMAP support" && return
+
+       trap cleanup_130 EXIT RETURN
+
+       local fm_file=$DIR/$tfile
+       lfs setstripe -s 65536 -c 2 $fm_file ||
+               error "setstripe failed on $fm_file"
+       NUM_BLKS=512
+       EXPECTED_LEN=$(( (NUM_BLKS / 2) * 4 ))
+       for ((i = 0; i < $NUM_BLKS; i++)); do
+               dd if=/dev/zero of=$fm_file count=1 bs=4096 seek=$((2*$i)) conv=notrunc > /dev/null 2>&1
+       done
+
+       filefrag -ves $fm_file || error "filefrag $fm_file failed"
+       filefrag_op=`filefrag -ve $fm_file | grep -A 750 "ext:" | grep -v "ext:" | grep -v "found"`
+
+       last_lun=`echo $filefrag_op | cut -d: -f5`
+
+       IFS=$'\n'
+       tot_len=0
+       num_luns=1
+       for line in $filefrag_op
+       do
+               frag_lun=`echo $line | cut -d: -f5`
+               ext_len=`echo $line | cut -d: -f4`
+               if (( $frag_lun != $last_lun )); then
+                       if (( tot_len != $EXPECTED_LEN )); then
+                               cleanup_130
+                               error "FIEMAP $fm_file: len $tot_len for OST $last_lun instead of $EXPECTED_LEN"
+                               return
+                       else
+                               (( num_luns += 1 ))
+                               tot_len=0
+                       fi
+               fi
+               (( tot_len += ext_len ))
+               last_lun=$frag_lun
+       done
+       if (( num_luns != 2 || tot_len != $EXPECTED_LEN )); then
+               cleanup_130
+               error "FIEMAP $fm_file: wrong number of LUNs or wrong len for OST $last_lun"
+               return
+       fi
+
+       cleanup_130
+
+       echo "FIEMAP with continuation calls succeeded"
+}
+run_test 130e "FIEMAP (test continuation FIEMAP calls)"
+
+test_140() { #bug-17379
+        mkdir -p $DIR/$tdir || error "Creating dir $DIR/$tdir"
+        cd $DIR/$tdir || error "Changing to $DIR/$tdir"
+        cp /usr/bin/stat . || error "Copying stat to $DIR/$tdir"
+
+        # VFS limits max symlink depth to 5(4KSTACK) or 8
+        local i=0
+        while i=`expr $i + 1`; do
+                mkdir -p $i || error "Creating dir $i"
+                cd $i || error "Changing to $i"
+                ln -s ../stat stat || error "Creating stat symlink"
+                # Read the symlink until ELOOP present,
+                # not LBUGing the system is considered success,
+                # we didn't overrun the stack.
+                $OPENFILE -f O_RDONLY stat >/dev/null 2>&1; ret=$?
+                [ $ret -ne 0 ] && {
+                        if [ $ret -eq 40 ]; then
+                                break  # -ELOOP
+                        else
+                                error "Open stat symlink"
+                                return
+                        fi
+                }
+        done
+        i=`expr $i - 1`
+        echo "The symlink depth = $i"
+        [ $i -eq 4 -o $i -eq 8 ] || error "Invalid symlink depth"
+}
+run_test 140 "Check reasonable stack depth (shouldn't LBUG) ===="
+
+test_150() {
+       local TF="$TMP/$tfile"
+
+        dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
+        cp $TF $DIR/$tfile
+        cancel_lru_locks osc
+        cmp $TF $DIR/$tfile || error "$TMP/$tfile $DIR/$tfile differ"
+        remount_client $MOUNT
+        cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (remount)"
+
+        $TRUNCATE $TF 6000
+        $TRUNCATE $DIR/$tfile 6000
+        cancel_lru_locks osc
+        cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (truncate1)"
+
+        echo "12345" >>$TF
+        echo "12345" >>$DIR/$tfile
+        cancel_lru_locks osc
+        cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append1)"
+
+        echo "12345" >>$TF
+        echo "12345" >>$DIR/$tfile
+        cancel_lru_locks osc
+        cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append2)"
+
+        rm -f $TF
+        true
+}
+run_test 150 "truncate/append tests"
+
+function roc_access() {
+       ACCNUM=`$LCTL get_param -n obdfilter.*.stats | \
+               grep 'cache_access' | awk '{print $2}' | \
+               awk '{sum=sum+$3} END{print sum}'`
+       echo $ACCNUM
+}
+
+function roc_hit() {
+       ACCNUM=`$LCTL get_param -n obdfilter.*.stats | \
+               grep 'cache_hit' | awk '{print $2}' | \
+               awk '{sum=sum+$1} END{print sum}'`
+       echo $ACCNUM
+}
+
+test_151() {
+       local CPAGES=3
+
+       # check whether obdfilter is cache capable at all
+       if ! $LCTL get_param -n obdfilter.*.read_cache_enable > /dev/null; then
+               echo "not cache-capable obdfilter"
+               return 0
+       fi
+
+       # make sure cache is enabled on all obdfilters
+       $LCTL set_param obdfilter.*.read_cache_enable=1
+       $LCTL set_param obdfilter.*.writethrough_cache_enable=1
+
+       # pages should be in the case right after write
+        dd if=/dev/urandom of=$DIR/$tfile bs=4k count=$CPAGES||error "dd failed"
+       BEFORE=`roc_hit`
+        cancel_lru_locks osc
+       cat $DIR/$tfile >/dev/null
+       AFTER=`roc_hit`
+       if let "AFTER - BEFORE != CPAGES"; then
+               error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
+       fi
+
+       # the following read invalidates the cache
+        cancel_lru_locks osc
+       $LCTL set_param -n obdfilter.*.read_cache_enable 0
+       cat $DIR/$tfile >/dev/null
+
+       # now data shouldn't be found in the cache
+       BEFORE=`roc_hit`
+        cancel_lru_locks osc
+       cat $DIR/$tfile >/dev/null
+       AFTER=`roc_hit`
+       if let "AFTER - BEFORE != 0"; then
+               error "IN CACHE: before: $BEFORE, after: $AFTER"
+       fi
+
+       $LCTL set_param -n obdfilter.*.read_cache_enable=1
+       $LCTL set_param obdfilter.*.writethrough_cache_enable=1
+        rm -f $DIR/$tfile
+}
+run_test 151 "test cache on oss and controls ==============================="
+
+test_152() {
+        local TF="$TMP/$tfile"
+
+       # simulate ENOMEM during write
+#define OBD_FAIL_OST_NOMEM             0x226
+        lctl set_param fail_loc=0x80000226
+        dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
+        cp $TF $DIR/$tfile
+        sync || error "sync failed"
+        lctl set_param fail_loc=0
+       
+        # discard client's cache
+        cancel_lru_locks osc
+
+        # simulate ENOMEM during read
+        lctl set_param fail_loc=0x80000226
+        cmp $TF $DIR/$tfile || error "cmp failed"
+        lctl set_param fail_loc=0
+
+       rm -f $TF
+}
+run_test 152 "test read/write with enomem ============================"
+
+test_153() {
+        multiop $DIR/$tfile Ow4096Ycu || error "multiop failed"
+}
+run_test 153 "test if fdatasync does not crash ======================="
+
+POOL=${POOL:-cea1}
+TGT_COUNT=$OSTCOUNT
+TGTPOOL_FIRST=1
+TGTPOOL_MAX=$(($TGT_COUNT - 1))
+TGTPOOL_STEP=2
+TGTPOOL_LIST=`seq $TGTPOOL_FIRST $TGTPOOL_STEP $TGTPOOL_MAX`
+POOL_ROOT=${POOL_ROOT:-$DIR/d200.pools}
+POOL_DIR=$POOL_ROOT/dir_tst
+POOL_FILE=$POOL_ROOT/file_tst
+
+check_file_in_pool()
+{
+       file=$1
+       res=$($GETSTRIPE $file | grep 0x | cut -f2)
+       for i in $res
+       do
+               found=$(echo :$TGTPOOL_LIST: | tr " " ":"  | grep :$i:)
+               if [[ "$found" == "" ]]
+               then
+                       echo "pool list: $TGTPOOL_LIST"
+                       echo "striping: $res"
+                       error "$file not allocated in $POOL"
+                       return 1
+               fi
+       done
+       return 0
+}
+
+test_200a() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       do_facet mgs $LCTL pool_new $FSNAME.$POOL
+       do_facet mgs $LCTL get_param -n lov.$FSNAME-mdtlov.pools.$POOL
+       [ $? == 0 ] || error "Pool creation of $POOL failed"
+}
+run_test 200a "Create new pool =========================================="
+
+test_200b() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       TGT=$(seq -f $FSNAME-OST%04g_UUID $TGTPOOL_FIRST $TGTPOOL_STEP \
+               $TGTPOOL_MAX | tr '\n' ' ')
+       do_facet mgs $LCTL pool_add $FSNAME.$POOL \
+               $FSNAME-OST[$TGTPOOL_FIRST-$TGTPOOL_MAX/$TGTPOOL_STEP]_UUID
+       res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-mdtlov.pools.$POOL |
+               sort | tr '\n' ' ')
+       [ "$res" = "$TGT" ] || error "Pool ($res) do not match requested ($TGT)"
+}
+run_test 200b "Add targets to a pool ===================================="
+
+test_200c() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       mkdir -p $POOL_DIR
+       $SETSTRIPE -c 2 -p $POOL $POOL_DIR
+       [ $? = 0 ] || error "Cannot set pool $POOL to $POOL_DIR"
+}
+run_test 200c "Set pool on a directory ================================="
+
+test_200d() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       res=$($GETSTRIPE $POOL_DIR | grep pool: | cut -f8 -d " ")
+       [ "$res" = $POOL ] || error "Pool on $POOL_DIR is not $POOL"
+}
+run_test 200d "Check pool on a directory ==============================="
+
+test_200e() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       failed=0
+       for i in $(seq -w 1 $(($TGT_COUNT * 3))); do
+               file=$POOL_DIR/file-$i
+               touch $file
+               check_file_in_pool $file
+               if [[ $? != 0 ]]; then
+                       failed=$(($failed + 1))
+               fi
+       done
+       [ "$failed" = 0 ] || error "$failed files not allocated in $POOL"
+}
+run_test 200e "Check files allocation from directory pool =============="
+
+test_200f() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       mkdir -p $POOL_FILE
+       failed=0
+       for i in $(seq -w 1 $(($TGT_COUNT * 3))); do
+               file=$POOL_FILE/spoo-$i
+               $SETSTRIPE -p $POOL $file
+               check_file_in_pool $file
+               if [[ $? != 0 ]]; then
+                       failed=$(($failed + 1))
+               fi
+       done
+       [ "$failed" = 0 ] || error "$failed files not allocated in $POOL"
+}
+run_test 200f "Create files in a pool ==================================="
+
+test_200g() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       TGT=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-mdtlov.pools.$POOL |
+               head -1)
+       do_facet mgs $LCTL pool_remove $FSNAME.$POOL $TGT
+       res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-mdtlov.pools.$POOL |
+               grep $TGT)
+       [ "$res" = "" ] || error "$TGT not removed from $FSNAME.$POOL"
+}
+run_test 200g "Remove a target from a pool ============================="
+
+test_200h() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       for TGT in $(do_facet mgs $LCTL get_param -n lov.$FSNAME-mdtlov.pools.$POOL); do
+               do_facet mgs $LCTL pool_remove $FSNAME.$POOL $TGT
+       done
+       res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-mdtlov.pools.$POOL)
+       [ "$res" = "" ] || error "Pool $FSNAME.$POOL cannot be drained"
+}
+run_test 200h "Remove all targets from a pool =========================="
+
+test_200i() {
+       [ -z "$(lctl get_param -n mdc.*.connect_flags | grep pools)" ] &&
+               skip "missing pools support on server" && return
+       remote_mds_nodsh && skip "remote MDS with nodsh" && return
+       do_facet mgs $LCTL pool_destroy $FSNAME.$POOL
+       res=$(do_facet mgs "$LCTL get_param -n lov.$FSNAME-mdtlov.pools.$POOL 2>/dev/null")
+       [ "$res" = "" ] || error "Pool $FSNAME.$POOL is not destroyed"
+}
+run_test 200i "Remove a pool ============================================"
+
+#
+# tests that do cleanup/setup should be run at the end
+#
+
+test_900() {
+        local ls
+        #define OBD_FAIL_MGC_PAUSE_PROCESS_LOG   0x903
+        $LCTL set_param fail_loc=0x903
+        # cancel_lru_locks mgc - does not work due to lctl set_param syntax
+        for ls in /proc/fs/lustre/ldlm/namespaces/MGC*/lru_size; do
+                echo "clear" > $ls
+        done
+        FAIL_ON_ERROR=true cleanup
+        FAIL_ON_ERROR=true setup
+}
+run_test 900 "umount should not race with any mgc requeue thread"
  
  log "cleanup: ======================================================"
  check_and_cleanup_lustre
diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh

index 16c1e14..f8844e6 100644 (file)
--- a/lustre/tests/sanityN.sh
+++ b/lustre/tests/sanityN.sh
@@ -3,8 +3,8 @@
  set -e
  
  ONLY=${ONLY:-"$*"}
-# bug number for skipped test:  3192 12652  15528/3811 9977  15528/11549
-ALWAYS_EXCEPT="                 14b  14c    19         28    29           $SANITYN_EXCEPT"
+# bug number for skipped test: 3192 12652  15528/3811 16929 9977 15528/11549
+ALWAYS_EXCEPT="                14b  14c    19         22    28   29          $SANITYN_EXCEPT"
  # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
  
  # bug number for skipped test:                                                    12652 12652
@@ -45,18 +45,15 @@ init_test_env $@
  [ "$SLOW" = "no" ] && EXCEPT_SLOW="12 16"
  
  SANITYLOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
-FAIL_ON_ERROR=false
+FAIL_ON_ERROR=${FAIL_ON_ERROR:-false}
  
  SETUP=${SETUP:-:}
  TRACE=${TRACE:-""}
  
-LPROC=/proc/fs/lustre
-
  [ "$SANITYLOG" ] && rm -f $SANITYLOG || true
  
  check_and_setup_lustre
  
-LPROC=/proc/fs/lustre
  LOVNAME=`lctl get_param -n llite.*.lov.common_name | tail -n 1`
  OSTCOUNT=`lctl get_param -n lov.$LOVNAME.numobd`
  
@@ -66,7 +63,7 @@ rm -rf $DIR1/[df][0-9]* $DIR1/lnk
  # $RUNAS_ID may get set incorrectly somewhere else
  [ $UID -eq 0 -a $RUNAS_ID -eq 0 ] && error "\$RUNAS_ID set to 0, but \$UID is also 0!"
  
-check_runas_id $RUNAS_ID $RUNAS
+check_runas_id $RUNAS_ID $RUNAS_ID $RUNAS
  
  build_test_filter
  
@@ -133,9 +130,9 @@ test_2e() {
  run_test 2e "check chmod on root is propagated to others"
  
  test_3() {
-       ( cd $DIR1 ; ln -s this/is/good lnk )
-       [ "this/is/good" = "`perl -e 'print readlink("'$DIR2/lnk'");'`" ] || \
-               error
+       ( cd $DIR1 ; ln -s this/is/good $tfile )
+       [ "this/is/good" = "`perl -e 'print readlink("'$DIR2/$tfile'");'`" ] ||
+               error "link $DIR2/$tfile not as expected"
  }
  run_test 3 "symlink on one mtpt, readlink on another ==========="
  
@@ -254,11 +251,14 @@ test_13() {       # bug 2451 - directory coherency
  run_test 13 "test directory page revocation ===================="
  
  test_14() {
-       mkdir $DIR1/d14
-       cp -p /bin/ls $DIR1/d14/ls
-       exec 100>> $DIR1/d14/ls
-       $DIR2/d14/ls && error || true
-       exec 100<&-
+       mkdir -p $DIR1/$tdir
+       cp -p /bin/ls $DIR1/$tdir/$tfile
+       multiop_bg_pause $DIR1/$tdir/$tfile Ow_c || return 1
+       MULTIPID=$!
+
+       $DIR2/$tdir/$tfile && error || true
+       kill -USR1 $MULTIPID
+       wait $MULTIPID || return 2
  }
  run_test 14 "execution of file open for write returns -ETXTBSY ="
  
@@ -331,16 +331,17 @@ test_16() {
  run_test 16 "2500 iterations of dual-mount fsx ================="
  
  test_17() { # bug 3513, 3667
-       [ ! -d /proc/fs/lustre/ost ] && skip "remote OST, skipping OST-only test" && return
+       remote_ost_nodsh && skip "remote OST with nodsh" && return
  
-       cp /etc/termcap $DIR1/f17
+       lfs setstripe $DIR1/$tfile -i 0 -c 1
+       cp /etc/termcap $DIR1/$tfile
         cancel_lru_locks osc > /dev/null
         #define OBD_FAIL_ONCE|OBD_FAIL_LDLM_CREATE_RESOURCE    0x30a
-       lctl set_param fail_loc=0x8000030a
-       ls -ls $DIR1/f17 | awk '{ print $1,$6 }' > $DIR1/f17-1 & \
-       ls -ls $DIR2/f17 | awk '{ print $1,$6 }' > $DIR2/f17-2
+       do_facet ost1 lctl set_param fail_loc=0x8000030a
+       ls -ls $DIR1/$tfile | awk '{ print $1,$6 }' > $DIR1/$tfile-1 & \
+       ls -ls $DIR2/$tfile | awk '{ print $1,$6 }' > $DIR2/$tfile-2
         wait
-       diff -u $DIR1/f17-1 $DIR2/f17-2 || error "files are different"
+       diff -u $DIR1/$tfile-1 $DIR2/$tfile-2 || error "files are different"
  }
  run_test 17 "resource creation/LVB creation race ==============="
  
@@ -353,10 +354,8 @@ run_test 18 "mmap sanity check ================================="
  test_19() { # bug3811
         [ -d /proc/fs/lustre/obdfilter ] || return 0
  
-       MAX=`cat /proc/fs/lustre/obdfilter/*/readcache_max_filesize | head -n 1`
-       for O in /proc/fs/lustre/obdfilter/*OST*; do
-               echo 4096 > $O/readcache_max_filesize
-       done
+       MAX=`lctl get_param -n obdfilter.*.readcache_max_filesize | head -n 1`
+       lctl set_param -n obdfilter.*OST*.readcache_max_filesize=4096
         dd if=/dev/urandom of=$TMP/f19b bs=512k count=32
         SUM=`cksum $TMP/f19b | cut -d" " -f 1,2`
         cp $TMP/f19b $DIR1/f19b
@@ -371,9 +370,7 @@ test_19() { # bug3811
                 [ "`cat $TMP/sum2`" = "$SUM" ] || \
                         error "$DIR2/f19b `cat $TMP/sum2` != $SUM"
         done
-       for O in /proc/fs/lustre/obdfilter/*OST*; do
-               echo $MAX > $O/readcache_max_filesize
-       done
+       lctl set_param -n obdfilter.*OST*.readcache_max_filesize=$MAX
         rm $DIR1/f19b
  }
  run_test 19 "test concurrent uncached read races ==============="
@@ -381,12 +378,12 @@ run_test 19 "test concurrent uncached read races ==============="
  test_20() {
         mkdir $DIR1/d20
         cancel_lru_locks osc
-       CNT=$((`cat /proc/fs/lustre/llite/*/dump_page_cache | wc -l`))
+       CNT=$((`lctl get_param -n llite.*.dump_page_cache | wc -l`))
         multiop $DIR1/f20 Ow8190c
         multiop $DIR2/f20 Oz8194w8190c
         multiop $DIR1/f20 Oz0r8190c
         cancel_lru_locks osc
-       CNTD=$((`cat /proc/fs/lustre/llite/*/dump_page_cache | wc -l` - $CNT))
+       CNTD=$((`lctl get_param -n llite.*.dump_page_cache | wc -l` - $CNT))
         [ $CNTD -gt 0 ] && \
             error $CNTD" page left in cache after lock cancel" || true
  }
@@ -515,7 +512,7 @@ test_26b() {
  run_test 26b "sync mtime between ost and mds"
  
  test_27() {
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
         lctl clear
         dd if=/dev/zero of=$DIR2/$tfile bs=$((4096+4))k conv=notrunc count=4 seek=3 &
         DD2_PID=$!
@@ -582,7 +579,7 @@ test_30() { #bug #11110
  
  run_test 30 "recreate file race ========="
  
-test_31() {
+test_31a() {
          mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir"
          writes=`LANG=C dd if=/dev/zero of=$DIR/$tdir/$tfile count=1 2>&1 |
                  awk 'BEGIN { FS="+" } /out/ {print $1}'`
@@ -592,7 +589,24 @@ test_31() {
                 awk 'BEGIN { FS="+" } /in/ {print $1}'`
          [ $reads -eq $writes ] || error "read" $reads "blocks, must be" $writes
  }
-run_test 31 "voluntary cancel / blocking ast race=============="
+run_test 31a "voluntary cancel / blocking ast race=============="
+
+test_31b() {
+        remote_ost || { skip "local OST" && return 0; }
+        remote_ost_nodsh && skip "remote OST w/o dsh" && return 0
+        mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir"
+        lfs setstripe $DIR/$tdir/$tfile -i 0 -c 1
+        cp /etc/hosts $DIR/$tdir/$tfile
+        #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE   0x314
+        lctl set_param fail_loc=0x314
+        #define OBD_FAIL_LDLM_OST_FAIL_RACE      0x316
+        do_facet ost1 lctl set_param fail_loc=0x316
+        # Don't crash kernel
+        cat $DIR2/$tdir/$tfile > /dev/null 2>&1
+        lctl set_param fail_loc=0
+        do_facet ost1 lctl set_param fail_loc=0
+}
+run_test 31b "voluntary OST cancel / blocking ast race=============="
  
  # enable/disable lockless truncate feature, depending on the arg 0/1
  enable_lockless_truncate() {
@@ -634,6 +648,8 @@ test_32a() { # bug 11270
  run_test 32a "lockless truncate"
  
  test_32b() { # bug 11270
+        remote_ost_nodsh && skip "remote OST with nodsh" && return
+
          local node
          local p="$TMP/sanityN-$TESTNAME.parameters"
          save_lustre_params $HOSTNAME "llite.*.contention_seconds" > $p
@@ -673,6 +689,95 @@ test_32b() { # bug 11270
  }
  run_test 32b "lockless i/o"
  
+test_33() { #16129
+        local OPER
+        local lock_in
+        local lock_out
+        for OPER in notimeout timeout ; do
+                rm $DIR1/$tfile 2>/dev/null
+                lock_in=$(do_nodes $(osts_nodes) "lctl get_param -n ldlm.namespaces.filter-*.lock_timeouts" | calc_sum)
+                if [ $OPER == "timeout" ] ; then
+                        for j in `seq $OSTCOUNT`; do
+                                #define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT    0x511
+                                do_facet ost$j lctl set_param fail_loc=0x511
+                        done
+                        echo lock should expire
+                else
+                        for j in `seq $OSTCOUNT`; do
+                                #define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
+                                do_facet ost$j lctl set_param fail_loc=0x512
+                        done
+                        echo lock should not expire
+                fi
+                echo writing on client1
+                dd if=/dev/zero of=$DIR1/$tfile count=100 conv=notrunc > /dev/null 2>&1
+                sync &
+                # wait for the flush
+                sleep 1
+                echo reading on client2
+                dd of=/dev/null if=$DIR2/$tfile > /dev/null 2>&1
+                # wait for a lock timeout
+                sleep 4
+                lock_out=$(do_nodes $(osts_nodes) "lctl get_param -n ldlm.namespaces.filter-*.lock_timeouts" | calc_sum)
+                if [ $OPER == "timeout" ] ; then 
+                        if [ $lock_in == $lock_out ]; then
+                                error "no lock timeout happened"
+                        else
+                                echo "success"
+                        fi
+                else
+                        if [ $lock_in != $lock_out ]; then
+                                error "lock timeout happened"
+                        else
+                                echo "success"
+                        fi
+                fi
+        done
+}
+run_test 33 "no lock timeout under IO"
+
+test_34() { # bug 17645
+        local generation=[]
+        local count=0
+        for imp in /proc/fs/lustre/osc/$FSNAME-OST*-osc-*; do
+            g=$(awk '/generation/{print $2}' $imp/import)
+            generation[count]=$g
+            let count=count+1
+        done
+
+        dd if=/dev/zero of=$MOUNT1/$tfile bs=1M count=10
+        sync
+        cancel_lru_locks osc
+
+        # Let's get some read locks so that later we have something to
+        # conflict with
+        dd if=$MOUNT1/$tfile of=$MOUNT1/${tfile}-1 bs=1k count=10000
+        
+        # Let's initiate -EINTR situation by setting fail_loc and take
+        # write lock on same file from same client. This will not cause
+        # bl_ast yet as lock is already in local cache.
+#define OBD_FAIL_LDLM_INTR_CP_AST        0x317
+        do_facet client "lctl set_param fail_loc=0x80000317"
+        dd if=$MOUNT1/${tfile}-1 of=$MOUNT1/$tfile bs=1k count=10000 &
+        sleep 1
+        
+        # Let's take write lock on same file from another mount. This
+        # should cause conflict and bl_ast
+        dd if=$MOUNT2/${tfile}-1 of=$MOUNT2/$tfile bs=1k count=10000 &
+        wait
+        do_facet client "lctl set_param fail_loc=0x0"
+        df -h $MOUNT1 $MOUNT2
+        count=0
+        for imp in /proc/fs/lustre/osc/$FSNAME-OST*-osc-*; do
+            g=$(awk '/generation/{print $2}' $imp/import)
+            if ! test "$g" -eq "${generation[count]}"; then
+                error "Eviction happened on import $(basename $imp)"
+            fi
+            let count=count+1
+        done
+}
+run_test 34 "-EINTR cp_ast vs. bl_ast race does not evict client"
+
  log "cleanup: ======================================================"
  
  check_and_cleanup_lustre
diff --git a/lustre/tests/sleeptest.c b/lustre/tests/sleeptest.c

index d8beceb..7e40b24 100644 (file)
--- a/lustre/tests/sleeptest.c
+++ b/lustre/tests/sleeptest.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <fcntl.h>
diff --git a/lustre/tests/small_write.c b/lustre/tests/small_write.c

index 442d0fd..666f909 100644 (file)
--- a/lustre/tests/small_write.c
+++ b/lustre/tests/small_write.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/stat.h>
  #include <sys/types.h>
diff --git a/lustre/tests/statmany.c b/lustre/tests/statmany.c

index 63a13ad..9c1429b 100644 (file)
--- a/lustre/tests/statmany.c
+++ b/lustre/tests/statmany.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/types.h>
diff --git a/lustre/tests/statone.c b/lustre/tests/statone.c

index e835eaa..f470572 100644 (file)
--- a/lustre/tests/statone.c
+++ b/lustre/tests/statone.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/types.h>
diff --git a/lustre/tests/tchmod.c b/lustre/tests/tchmod.c

index 08732ff..8936171 100644 (file)
--- a/lustre/tests/tchmod.c
+++ b/lustre/tests/tchmod.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <errno.h>
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh

index 6169a2c..c5bf857 100644 (file)
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -7,6 +7,7 @@ set -e
  
  
  export REFORMAT=${REFORMAT:-""}
+export WRITECONF=${WRITECONF:-""}
  export VERBOSE=false
  export GMNALNID=${GMNALNID:-/usr/sbin/gmlndnid}
  export CATASTROPHE=${CATASTROPHE:-/proc/sys/lnet/catastrophe}
@@ -26,12 +27,12 @@ assert_env() {
  
  assert_DIR () {
      local failed=""
-    [ -z "`echo :$DIR: | grep :$MOUNT:`" ] && \
-        failed=1 && echo "DIR not in $MOUNT. Aborting."
-    [ -z "`echo :$DIR1: | grep :$MOUNT1:`" ] && \
-        failed=1 && echo "DIR1 not in $MOUNT1. Aborting."
-    [ -z "`echo :$DIR2: | grep :$MOUNT2:`" ] && \
-        failed=1 && echo "DIR2 not in $MOUNT2. Aborting"
+    [[ $DIR/ = $MOUNT/* ]] || \
+        { failed=1 && echo "DIR=$DIR not in $MOUNT. Aborting."; }
+    [[ $DIR1/ = $MOUNT1/* ]] || \
+        { failed=1 && echo "DIR1=$DIR1 not in $MOUNT1. Aborting."; }
+    [[ $DIR2/ = $MOUNT2/* ]] || \
+        { failed=1 && echo "DIR2=$DIR2 not in $MOUNT2. Aborting"; }
  
      [ -n "$failed" ] && exit 99 || true
  }
@@ -55,7 +56,7 @@ print_summary () {
          local o=$(echo $O | tr "[:upper:]" "[:lower:]")
          o=${o//_/-}
          o=${o//tyn/tyN}
-        local log=${TMP}/${o}.log 
+        local log=${TMP}/${o}.log
          [ -f $log ] && skipped=$(grep excluded $log | awk '{ printf " %s", $3 }' | sed 's/test_//g')
          [ -f $log ] && slow=$(grep SLOW $log | awk '{ printf " %s", $3 }' | sed 's/test_//g')
          [ "${!O}" = "done" ] && \
@@ -78,8 +79,9 @@ print_summary () {
  init_test_env() {
      export LUSTRE=`absolute_path $LUSTRE`
      export TESTSUITE=`basename $0 .sh`
+    export TEST_FAILED=false
  
-    [ -d /r ] && export ROOT=${ROOT:-/r}
+    #[ -d /r ] && export ROOT=${ROOT:-/r}
      export TMP=${TMP:-$ROOT/tmp}
      export TESTSUITELOG=${TMP}/${TESTSUITE}.log
      export HOSTNAME=${HOSTNAME:-`hostname`}
@@ -89,19 +91,23 @@ init_test_env() {
      if ! echo $PATH | grep -q $LUSTRE/test; then
         export PATH=$PATH:$LUSTRE/tests
      fi
+    export MDSRATE=${MDSRATE:-"$LUSTRE/tests/mdsrate"}
+    [ ! -f "$MDSRATE" ] && export MDSRATE=$(which mdsrate 2> /dev/null)
+    if ! echo $PATH | grep -q $LUSTRE/test/racer; then
+        export PATH=$PATH:$LUSTRE/tests/racer
+    fi
      export LCTL=${LCTL:-"$LUSTRE/utils/lctl"}
      export LFS=${LFS:-"$LUSTRE/utils/lfs"}
-    [ ! -f "$LCTL" ] && export LCTL=$(which lctl) 
+    [ ! -f "$LCTL" ] && export LCTL=$(which lctl)
      export LFS=${LFS:-"$LUSTRE/utils/lfs"}
-    [ ! -f "$LFS" ] && export LFS=$(which lfs) 
+    [ ! -f "$LFS" ] && export LFS=$(which lfs)
      export MKFS=${MKFS:-"$LUSTRE/utils/mkfs.lustre"}
-    [ ! -f "$MKFS" ] && export MKFS=$(which mkfs.lustre) 
+    [ ! -f "$MKFS" ] && export MKFS=$(which mkfs.lustre)
      export TUNEFS=${TUNEFS:-"$LUSTRE/utils/tunefs.lustre"}
-    [ ! -f "$TUNEFS" ] && export TUNEFS=$(which tunefs.lustre) 
+    [ ! -f "$TUNEFS" ] && export TUNEFS=$(which tunefs.lustre)
      export CHECKSTAT="${CHECKSTAT:-"checkstat -v"} "
      export FSYTPE=${FSTYPE:-"ldiskfs"}
      export NAME=${NAME:-local}
-    export LPROC=/proc/fs/lustre
      export DIR2
      export AT_MAX_PATH
      export SAVE_PWD=${SAVE_PWD:-$LUSTRE/tests}
@@ -110,18 +116,19 @@ init_test_env() {
          export PORT_OPT="--port $ACCEPTOR_PORT"
      fi
  
-    # Paths on remote nodes, if different 
+    # Paths on remote nodes, if different
      export RLUSTRE=${RLUSTRE:-$LUSTRE}
      export RPWD=${RPWD:-$PWD}
      export I_MOUNTED=${I_MOUNTED:-"no"}
  
      # command line
-    
-    while getopts "rvf:" opt $*; do 
+
+    while getopts "rvwf:" opt $*; do
          case $opt in
              f) CONFIG=$OPTARG;;
              r) REFORMAT=--reformat;;
              v) VERBOSE=true;;
+            w) WRITECONF=writeconf;;
              \?) usage;;
          esac
      done
@@ -135,7 +142,6 @@ init_test_env() {
  }
  
  case `uname -r` in
-2.4.*) EXT=".o"; USE_QUOTA=no; [ ! "$CLIENTONLY" ] && FSTYPE=ext3;;
      *) EXT=".ko"; USE_QUOTA=yes;;
  esac
  
@@ -167,10 +173,12 @@ load_modules() {
      load_module ../lnet/libcfs/libcfs
      [ "$PTLDEBUG" ] && lctl set_param debug=$PTLDEBUG
      [ "$SUBSYSTEM" ] && lctl set_param subsystem_debug=${SUBSYSTEM# }
+    local MODPROBECONF=
      [ -f /etc/modprobe.conf ] && MODPROBECONF=/etc/modprobe.conf
-    [ -f /etc/modprobe.d/Lustre ] && MODPROBECONF=/etc/modprobe.d/Lustre
-    [ -z "$LNETOPTS" -a -n "$MODPROBECONF" ] && \
+    [ ! "$MODPROBECONF" -a -d /etc/modprobe.d ] && MODPROBECONF=/etc/modprobe.d/Lustre
+    [ -z "$LNETOPTS" -a "$MODPROBECONF" ] && \
          LNETOPTS=$(awk '/^options lnet/ { print $0}' $MODPROBECONF | sed 's/^options lnet //g')
+    echo $LNETOPTS | grep -q "accept=all"  || LNETOPTS="$LNETOPTS accept=all";
      echo "lnet options: '$LNETOPTS'"
      # note that insmod will ignore anything in modprobe.conf
      load_module ../lnet/lnet/lnet $LNETOPTS
@@ -244,17 +252,30 @@ unload_dep_module() {
      $RMMOD $MODULE || true
  }
  
+check_mem_leak () {
+    LEAK_LUSTRE=$(dmesg | tail -n 30 | grep "obd_memory.*leaked" || true)
+    LEAK_PORTALS=$(dmesg | tail -n 20 | grep "Portals memory leaked" || true)
+    if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then
+        echo "$LEAK_LUSTRE" 1>&2
+        echo "$LEAK_PORTALS" 1>&2
+        mv $TMP/debug $TMP/debug-leak.`date +%s` || true
+        echo "Memory leaks detected"
+        [ -n "$IGNORE_LEAK" ] && { echo "ignoring leaks" && return 0; } || true
+        return 1
+    fi
+}
+
  unload_modules() {
      wait_exit_ST client # bug 12845
  
      lsmod | grep libcfs > /dev/null && $LCTL dl
-    unload_dep_module $FSTYPE
+    [ -z "$CLIENTONLY" ] && unload_dep_module $FSTYPE
      unload_dep_module libcfs
  
      local MODULES=$($LCTL modules | awk '{ print $2 }')
      if [ -n "$MODULES" ]; then
          echo "Modules still loaded: "
-        echo $MODULES 
+        echo $MODULES
          if [ "$(lctl dl)" ]; then
              echo "Lustre still loaded"
              lctl dl || true
@@ -267,16 +288,8 @@ unload_modules() {
      fi
      HAVE_MODULES=false
  
-    LEAK_LUSTRE=$(dmesg | tail -n 30 | grep "obd mem.*leaked" || true)
-    LEAK_PORTALS=$(dmesg | tail -n 20 | grep "Portals memory leaked" || true)
-    if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then
-        echo "$LEAK_LUSTRE" 1>&2
-        echo "$LEAK_PORTALS" 1>&2
-        mv $TMP/debug $TMP/debug-leak.`date +%s` || true
-        echo "Memory leaks detected"
-        [ -n "$IGNORE_LEAK" ] && echo "ignoring leaks" && return 0
-        return 254
-    fi
+    check_mem_leak || return 254
+
      echo "modules unloaded."
      return 0
  }
@@ -288,7 +301,7 @@ mount_facet() {
      local dev=${facet}_dev
      local opt=${facet}_opt
      echo "Starting ${facet}: ${!opt} $@ ${!dev} ${MOUNT%/*}/${facet}"
-    do_facet ${facet} mount -t lustre ${!opt} $@ ${!dev} ${MOUNT%/*}/${facet}     
+    do_facet ${facet} mount -t lustre ${!opt} $@ ${!dev} ${MOUNT%/*}/${facet}
      RC=${PIPESTATUS[0]}
      if [ $RC -ne 0 ]; then
          echo "mount -t lustre $@ ${!dev} ${MOUNT%/*}/${facet}"
@@ -298,7 +311,7 @@ mount_facet() {
              lctl set_param subsystem_debug=${SUBSYSTEM# }; \
              lctl set_param debug_mb=${DEBUG_SIZE}; \
              sync"
- 
+
          label=$(do_facet ${facet} "e2label ${!dev}")
          [ -z "$label" ] && echo no label for ${!dev} && exit 1
          eval export ${facet}_svc=${label}
@@ -307,7 +320,7 @@ mount_facet() {
      return $RC
  }
  
-# start facet device options 
+# start facet device options
  start() {
      facet=$1
      shift
@@ -354,14 +367,13 @@ zconf_mount() {
          exit 1
      fi
  
-    echo "Starting client: $client: $OPTIONS $device $mnt" 
+    echo "Starting client: $client: $OPTIONS $device $mnt"
      do_node $client mkdir -p $mnt
      do_node $client mount -t lustre $OPTIONS $device $mnt || return 1
      do_node $client "lctl set_param debug=$PTLDEBUG;
          lctl set_param subsystem_debug=${SUBSYSTEM# };
          lctl set_param debug_mb=${DEBUG_SIZE}"
  
-    [ -d /r ] && $LCTL modules > /r/tmp/ogdb-$HOSTNAME
      return 0
  }
  
@@ -372,11 +384,12 @@ zconf_umount() {
      local running=$(do_node $client "grep -c $mnt' ' /proc/mounts") || true
      if [ $running -ne 0 ]; then
          echo "Stopping client $client $mnt (opts:$force)"
-        lsof | grep "$mnt" || true
+        do_node $client lsof | grep "$mnt" || true
          do_node $client umount $force $mnt
      fi
  }
  
+# mount clients if not mouted
  zconf_mount_clients() {
      local OPTIONS
      local clients=$1
@@ -393,8 +406,10 @@ zconf_mount_clients() {
      fi
  
      echo "Starting client $clients: $OPTIONS $device $mnt"
-    do_nodes $clients mkdir -p $mnt
-    do_nodes $clients mount -t lustre $OPTIONS $device $mnt || return 1
+    do_nodes $clients "mount | grep $mnt || { mkdir -p $mnt && mount -t lustre $OPTIONS $device $mnt || false; }"
+
+    echo "Started clients $clients: "
+    do_nodes $clients "mount | grep $mnt"
  
      do_nodes $clients "sysctl -w lnet.debug=$PTLDEBUG;
          sysctl -w lnet.subsystem_debug=${SUBSYSTEM# };
@@ -413,10 +428,10 @@ zconf_umount_clients() {
  }
  
  shutdown_facet() {
-    facet=$1
+    local facet=$1
      if [ "$FAILURE_MODE" = HARD ]; then
          $POWER_DOWN `facet_active_host $facet`
-        sleep 2 
+        sleep 2
      elif [ "$FAILURE_MODE" = SOFT ]; then
          stop $facet
      fi
@@ -431,6 +446,99 @@ reboot_facet() {
      fi
  }
  
+boot_node() {
+    local node=$1
+    if [ "$FAILURE_MODE" = HARD ]; then
+       $POWER_UP $node
+    fi
+}
+
+# recovery-scale functions
+check_progs_installed () {
+    local clients=$1
+    shift
+    local progs=$@
+
+    do_nodes $clients "set -x ; PATH=:$PATH status=true; for prog in $progs; do
+        which \\\$prog || { echo \\\$prog missing on \\\$(hostname) && status=false; }
+        done;
+        eval \\\$status"
+}
+
+start_client_load() {
+    local list=(${1//,/ })
+    local nodenum=$2
+
+    local numloads=${#CLIENT_LOADS[@]}
+    local testnum=$((nodenum % numloads))
+
+    do_node ${list[nodenum]} "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
+                              BREAK_ON_ERROR=$BREAK_ON_ERROR \
+                              END_RUN_FILE=$END_RUN_FILE \
+                              LOAD_PID_FILE=$LOAD_PID_FILE \
+                              TESTSUITELOG=$TESTSUITELOG \
+                              run_${CLIENT_LOADS[testnum]}.sh" &
+    CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!"
+    log "Started client load: ${CLIENT_LOADS[testnum]} on ${list[nodenum]}"
+
+    eval export ${list[nodenum]}_load=${CLIENT_LOADS[testnum]}
+    return 0
+}
+
+start_client_loads () {
+    local clients=(${1//,/ })
+
+    for ((num=0; num < ${#clients[@]}; num++ )); do
+        start_client_load $1 $num
+    done
+}
+
+# only for remote client 
+check_client_load () {
+    local client=$1
+    local var=${client}_load
+
+    local TESTLOAD=run_${!var}.sh
+
+    ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
+
+    check_catastrophe $client || return 2
+
+    # see if the load is still on the client
+    local tries=3
+    local RC=254
+    while [ $RC = 254 -a $tries -gt 0 ]; do
+        let tries=$tries-1
+        # assume success
+        RC=0
+        if ! do_node $client "ps auxwww | grep -v grep | grep -q $TESTLOAD"; then
+            RC=${PIPESTATUS[0]}
+            sleep 30
+        fi
+    done
+    if [ $RC = 254 ]; then
+        echo "got a return status of $RC from do_node while checking (i.e. with 'ps') the client load on the remote system"
+        # see if we can diagnose a bit why this is
+    fi
+
+    return $RC
+}
+check_client_loads () {
+   local clients=${1//,/ }
+   local client=
+   local rc=0
+
+   for client in $clients; do
+      check_client_load $client
+      rc=$?
+      if [ "$rc" != 0 ]; then
+        log "Client load failed on node $client, rc=$rc"
+        return $rc
+      fi
+   done
+}
+# End recovery-scale functions
+
  # verify that lustre actually cleaned up properly
  cleanup_check() {
      [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \
@@ -441,15 +549,8 @@ cleanup_check() {
          [ -e $TMP/debug ] && mv $TMP/debug $TMP/debug-busy.`date +%s`
          exit 205
      fi
-    LEAK_LUSTRE=`dmesg | tail -n 30 | grep "obd mem.*leaked" || true`
-    LEAK_PORTALS=`dmesg | tail -n 20 | grep "Portals memory leaked" || true`
-    if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then
-        echo "$0: $LEAK_LUSTRE" 1>&2
-        echo "$0: $LEAK_PORTALS" 1>&2
-        echo "$0: Memory leak(s) detected..." 1>&2
-        mv $TMP/debug $TMP/debug-leak.`date +%s`
-        exit 204
-    fi
+
+    check_mem_leak || exit 204
  
      [ "`lctl dl 2> /dev/null | wc -l`" -gt 0 ] && lctl dl && \
          echo "$0: lustre didn't clean up..." 1>&2 && return 202 || true
@@ -507,7 +608,7 @@ wait_mds_recovery_done () {
          echo "Waiting $(($MAX - $WAIT)) secs for MDS recovery done"
      done
      echo "MDS recovery not done in $MAX sec"
-    return 1            
+    return 1
  }
  
  wait_exit_ST () {
@@ -515,6 +616,7 @@ wait_exit_ST () {
  
      local WAIT=0
      local INTERVAL=1
+    local running
      # conf-sanity 31 takes a long time cleanup
      while [ $WAIT -lt 300 ]; do
          running=$(do_facet ${facet} "lsmod | grep lnet > /dev/null && lctl dl | grep ' ST '") || true
@@ -529,6 +631,38 @@ wait_exit_ST () {
      return 1
  }
  
+wait_remote_prog () {
+   local prog=$1
+   local WAIT=0
+   local INTERVAL=5
+   local rc=0
+
+   [ "$PDSH" = "no_dsh" ] && return 0
+
+   while [ $WAIT -lt $2 ]; do
+        running=$(ps uax | grep "$PDSH.*$prog.*$MOUNT" | grep -v grep) || true
+        [ -z "${running}" ] && return 0 || true
+        echo "waited $WAIT for: "
+        echo "$running"
+        [ $INTERVAL -lt 60 ] && INTERVAL=$((INTERVAL + INTERVAL))
+        sleep $INTERVAL
+        WAIT=$((WAIT + INTERVAL))
+    done
+    local pids=$(ps  uax | grep "$PDSH.*$prog.*$MOUNT" | grep -v grep | awk '{print $2}')
+    [ -z "$pids" ] && return 0
+    echo "$PDSH processes still exists after $WAIT seconds.  Still running: $pids"
+    for pid in $pids; do
+        cat /proc/${pid}/status || true
+        cat /proc/${pid}/wchan || true
+        echo "Killing $pid"
+        kill -9 $pid || true
+        sleep 1
+        ps -P $pid && rc=1
+    done
+
+    return $rc
+}
+
  client_df() {
      # not every config has many clients
      if [ -n "$CLIENTS" ]; then
@@ -540,8 +674,10 @@ client_df() {
  
  client_reconnect() {
      uname -n >> $MOUNT/recon
-    if [ ! -z "$CLIENTS" ]; then
-        $PDSH $CLIENTS "df $MOUNT; uname -n >> $MOUNT/recon" > /dev/null
+    if [ -z "$CLIENTS" ]; then
+        df $MOUNT; uname -n >> $MOUNT/recon
+    else
+        do_nodes $CLIENTS "df $MOUNT; uname -n >> $MOUNT/recon" > /dev/null
      fi
      echo Connected clients:
      cat $MOUNT/recon
@@ -551,11 +687,14 @@ client_reconnect() {
  
  facet_failover() {
      facet=$1
+    sleep_time=$2
      echo "Failing $facet on node `facet_active_host $facet`"
      shutdown_facet $facet
+    [ -n "$sleep_time" ] && sleep $sleep_time
      reboot_facet $facet
      client_df &
      DFPID=$!
+    RECOVERY_START_TIME=`date +%s`
      echo "df pid is $DFPID"
      change_active $facet
      TO=`facet_active_host $facet`
@@ -652,7 +791,7 @@ declare -fx h2ptl
  
  h2tcp() {
      if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else
-        echo $1"@tcp" 
+        echo $1"@tcp"
      fi
  }
  declare -fx h2tcp
@@ -684,6 +823,8 @@ declare -fx h2o2ib
  
  facet_host() {
      local facet=$1
+
+    [ "$facet" == client ] && echo -n $HOSTNAME && return
      varname=${facet}_HOST
      if [ -z "${!varname}" ]; then
          if [ "${facet:0:3}" == "ost" ]; then
@@ -702,7 +843,7 @@ facet_active() {
      fi
  
      active=${!activevar}
-    if [ -z "$active" ] ; then 
+    if [ -z "$active" ] ; then
          echo -n ${facet}
      else
          echo -n ${active}
@@ -721,7 +862,7 @@ facet_active_host() {
  
  change_active() {
      local facet=$1
-    failover=${facet}failover 
+    failover=${facet}failover
      host=`facet_host $failover`
      [ -z "$host" ] && return
      curactive=`facet_active $facet`
@@ -755,7 +896,7 @@ do_node() {
         local command_status="$TMP/cs"
         rsh $HOST ":> $command_status"
         rsh $HOST "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin;
-                   cd $RPWD; sh -c \"$@\") || 
+                   cd $RPWD; sh -c \"$@\") ||
                     echo command failed >$command_status"
         [ -n "$($myPDSH $HOST cat $command_status)" ] && return 1 || true
          return 0
@@ -764,25 +905,23 @@ do_node() {
      return ${PIPESTATUS[0]}
  }
  
+single_local_node () {
+   [ "$1" = "$HOSTNAME" ]
+}
+
  do_nodes() {
-    local nodes=$1
+    local rnodes=$1
      shift
  
-    nodes=${nodes//,/ }
-    # split list to local and remote
-    local rnodes=$(echo " $nodes " | sed -re "s/\s+$HOSTNAME\s+/ /g")
- 
-    if [ "$(get_node_count $nodes)" != "$(get_node_count $rnodes)" ]; then
-        do_node $HOSTNAME $@
+    if $(single_local_node $rnodes); then
+        do_node $rnodes $@
+        return $?
      fi
  
-    [ -z "$(echo $rnodes)" ] && return 0
-
      # This is part from do_node
      local myPDSH=$PDSH
  
-    rnodes=$(comma_list $rnodes)
-    [ -z "$myPDSH" -o "$myPDSH" = "no_dsh" ] && \
+    [ -z "$myPDSH" -o "$myPDSH" = "no_dsh" -o "$myPDSH" = "rsh" ] && \
          echo "cannot run remote command on $rnodes with $myPDSH" && return 128
  
      if $VERBOSE; then
@@ -790,16 +929,6 @@ do_nodes() {
          $myPDSH $rnodes $LCTL mark "$@" > /dev/null 2>&1 || :
      fi
  
-    if [ "$myPDSH" = "rsh" ]; then
-# we need this because rsh does not return exit code of an executed command
-       local command_status="$TMP/cs"
-       rsh $rnodes ":> $command_status"
-       rsh $rnodes "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin;
-                   cd $RPWD; sh -c \"$@\") || 
-                   echo command failed >$command_status"
-       [ -n "$($myPDSH $rnodes cat $command_status)" ] && return 1 || true
-        return 0
-    fi
      $myPDSH $rnodes "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; cd $RPWD; sh -c \"$@\")" | sed -re "s/\w+:\s//g"
      return ${PIPESTATUS[0]}
  }
@@ -839,8 +968,8 @@ stopall() {
      if [ $activemds != "mds" ]; then
          fail mds
      fi
-    
-    # assume client mount is local 
+
+    # assume client mount is local
      grep " $MOUNT " /proc/mounts && zconf_umount $HOSTNAME $MOUNT $*
      grep " $MOUNT2 " /proc/mounts && zconf_umount $HOSTNAME $MOUNT2 $*
  
@@ -910,12 +1039,30 @@ set_obd_timeout() {
      do_facet $facet "lctl set_param timeout=$timeout"
  }
  
+writeconf_facet () {
+    local facet=$1
+    local dev=$2
+
+    do_facet $facet "$TUNEFS --writeconf $dev"
+}
+
+writeconf_all () {
+    writeconf_facet mds $MDSDEV
+
+    for num in `seq $OSTCOUNT`; do
+        DEVNAME=`ostdevname $num`
+        writeconf_facet ost$num $DEVNAME
+    done
+}
+
  setupall() {
      load_modules
      if [ -z "$CLIENTONLY" ]; then
          echo Setup mdt, osts
-        echo $REFORMAT | grep -q "reformat" \
-           || do_facet mds "$TUNEFS --writeconf $MDSDEV"
+
+        echo $WRITECONF | grep -q "writeconf" && \
+            writeconf_all
+
          set_obd_timeout mds $TIMEOUT
          start mds $MDSDEV $MDS_MOUNT_OPTS
          # We started mds, now we should set failover variable properly.
@@ -945,20 +1092,80 @@ setupall() {
          [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT2
      fi
      sleep 5
+    init_versions_vars
  }
  
  mounted_lustre_filesystems() {
         awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts
  }
  
+init_facet_vars () {
+    local facet=$1
+    shift
+    local device=$1
+
+    shift
+
+    eval export ${facet}_dev=${device}
+    eval export ${facet}_opt=\"$@\"
+
+    local dev=${facet}_dev
+    local label=$(do_facet ${facet} "e2label ${!dev}")
+    [ -z "$label" ] && echo no label for ${!dev} && exit 1
+
+    eval export ${facet}_svc=${label}
+
+    local varname=${facet}failover_HOST
+    if [ -z "${!varname}" ]; then
+       eval $varname=$(facet_host $facet)
+    fi
+}
+
+init_facets_vars () {
+    init_facet_vars mds $MDSDEV $MDS_MOUNT_OPTS
+
+    for num in `seq $OSTCOUNT`; do
+        DEVNAME=`ostdevname $num`
+        init_facet_vars ost$num $DEVNAME $OST_MOUNT_OPTS
+    done
+}
+
+init_versions_vars () {
+    export MDSVER=$(do_facet mds "lctl get_param version" | cut -d. -f1,2)
+    export OSTVER=$(do_facet ost1 "lctl get_param version" | cut -d. -f1,2)
+    export CLIVER=$(lctl get_param version | cut -d. -f 1,2)
+}
+
+check_config () {
+    local mntpt=$1
+    local myMGS_host=$mgs_HOST   
+    if [ "$NETTYPE" = "ptl" ]; then
+        myMGS_host=$(h2ptl $mgs_HOST | sed -e s/@ptl//) 
+    fi
+
+    echo Checking config lustre mounted on $mntpt
+    local mgshost=$(mount | grep " $mntpt " | awk -F@ '{print $1}')
+    mgshost=$(echo $mgshost | awk -F: '{print $1}')
+
+    if [ "$mgshost" != "$myMGS_host" ]; then
+        FAIL_ON_ERROR=true \
+            error "Bad config file: lustre is mounted with mgs $mgshost, but mgs_HOST=$mgs_HOST, NETTYPE=$NETTYPE
+                   Please use correct config or set mds_HOST correctly!"
+    fi
+}
+
  check_and_setup_lustre() {
-    MOUNTED="`mounted_lustre_filesystems`"
-    if [ -z "$MOUNTED" ]; then
+    local MOUNTED=$(mounted_lustre_filesystems)
+    if [ -z "$MOUNTED" ] || ! $(echo $MOUNTED | grep -w -q $MOUNT); then
          [ "$REFORMAT" ] && formatall
          setupall
-        MOUNTED="`mounted_lustre_filesystems`"
+        MOUNTED=$(mounted_lustre_filesystems | head -1)
          [ -z "$MOUNTED" ] && error "NAME=$NAME not mounted"
          export I_MOUNTED=yes
+    else
+        check_config $MOUNT
+        init_facets_vars
+        init_versions_vars
      fi
      if [ "$ONLY" == "setup" ]; then
          exit 0
@@ -969,7 +1176,7 @@ cleanup_and_setup_lustre() {
      if [ "$ONLY" == "cleanup" -o "`mount | grep $MOUNT`" ]; then
          lctl set_param debug=0 || true
          cleanupall
-        if [ "$ONLY" == "cleanup" ]; then 
+        if [ "$ONLY" == "cleanup" ]; then
             exit 0
          fi
      fi
@@ -986,7 +1193,7 @@ check_and_cleanup_lustre() {
      unset I_MOUNTED
  }
  
-####### 
+#######
  # General functions
  
  check_network() {
@@ -1025,6 +1232,16 @@ comma_list() {
      echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g'
  }
  
+# list is comma separated list
+exclude_item_from_list () {
+    local list=$1
+    local excluded=$2
+
+    list=${list//,/ }
+    list=$(echo " $list " | sed -re "s/\s+$excluded\s+/ /g")
+    echo $(comma_list $list) 
+}
+
  absolute_path() {
      (cd `dirname $1`; echo $PWD/`basename $1`)
  }
@@ -1053,9 +1270,16 @@ at_is_enabled() {
  }
  
  at_max_get() {
+    local facet=$1
+
      at_is_valid || error "invalid call"
  
-    do_facet $1 "cat $AT_MAX_PATH"
+    # suppose that all ost-s has the same at_max set
+    if [ $facet == "ost" ]; then
+        do_facet ost1 "cat $AT_MAX_PATH"
+    else
+        do_facet $facet "cat $AT_MAX_PATH"
+    fi
  }
  
  at_max_set() {
@@ -1064,6 +1288,7 @@ at_max_set() {
  
      at_is_valid || error "invalid call"
  
+    local facet
      for facet in $@; do
          if [ $facet == "ost" ]; then
              for i in `seq $OSTCOUNT`; do
@@ -1161,18 +1386,11 @@ set_nodes_failloc () {
  
  cancel_lru_locks() {
      $LCTL mark "cancel_lru_locks $1 start"
-    lctl set_param ldlm.namespaces.*$1*.lru_size=0
-    lctl get_param ldlm.namespaces.*$1*.lock_unused_count | grep -v '=0'
-    $LCTL mark "cancel_lru_locks $1 stop"
-}
-
-set_nodes_failloc () {
-    local nodes=$1
-    local node
-
-    for node in $nodes ; do
-        do_node $node sysctl -w lustre.fail_loc=$2
+    for d in `lctl get_param -N ldlm.namespaces.*.lru_size | egrep -i $1`; do
+        $LCTL set_param -n $d=clear
      done
+    $LCTL get_param ldlm.namespaces.*.lock_unused_count | egrep -i $1 | grep -v '=0'
+    $LCTL mark "cancel_lru_locks $1 stop"
  }
  
  default_lru_size()
@@ -1204,6 +1422,16 @@ pgcache_empty() {
      return 0
  }
  
+create_fake_exports () {
+    local facet=$1
+    local num=$2
+#obd_fail_val = num;
+#define OBD_FAIL_TGT_FAKE_EXP 0x708
+    do_facet $facet "lctl set_param fail_val=$num"
+    do_facet $facet "lctl set_param fail_loc=0x80000708"
+    fail $facet
+}
+
  debugsave() {
      DEBUGSAVE="$(lctl get_param -n debug)"
  }
@@ -1214,7 +1442,7 @@ debugrestore() {
  }
  
  ##################################
-# Test interface 
+# Test interface
  ##################################
  
  error_noexit() {
@@ -1231,6 +1459,7 @@ error_noexit() {
      done
      debugrestore
      [ "$TESTSUITELOG" ] && echo "$0: ${TYPE}: $TESTNAME $@" >> $TESTSUITELOG
+    TEST_FAILED=true
  }
  
  error() {
@@ -1254,7 +1483,8 @@ error_ignore() {
  
  skip () {
         log " SKIP: ${TESTSUITE} ${TESTNAME} $@"
-       [ "$TESTSUITELOG" ] && echo "${TESTSUITE}: SKIP: $TESTNAME $@" >> $TESTSUITELOG
+       [ "$TESTSUITELOG" ] && \
+               echo "${TESTSUITE}: SKIP: $TESTNAME $@" >> $TESTSUITELOG || true
  }
  
  build_test_filter() {
@@ -1285,6 +1515,8 @@ basetest() {
      IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
  }
  
+# print a newline if the last test was skipped
+export LAST_SKIPPED=
  run_test() {
      assert_DIR
  
@@ -1292,40 +1524,48 @@ run_test() {
      if [ ! -z "$ONLY" ]; then
          testname=ONLY_$1
          if [ ${!testname}x != x ]; then
+            [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED=
              run_one $1 "$2"
              return $?
          fi
          testname=ONLY_$base
          if [ ${!testname}x != x ]; then
+            [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED=
              run_one $1 "$2"
              return $?
          fi
+        LAST_SKIPPED="y"
          echo -n "."
          return 0
      fi
      testname=EXCEPT_$1
      if [ ${!testname}x != x ]; then
+        LAST_SKIPPED="y"
          TESTNAME=test_$1 skip "skipping excluded test $1"
          return 0
      fi
      testname=EXCEPT_$base
      if [ ${!testname}x != x ]; then
+        LAST_SKIPPED="y"
          TESTNAME=test_$1 skip "skipping excluded test $1 (base $base)"
          return 0
      fi
      testname=EXCEPT_SLOW_$1
      if [ ${!testname}x != x ]; then
+        LAST_SKIPPED="y"
          TESTNAME=test_$1 skip "skipping SLOW test $1"
          return 0
      fi
      testname=EXCEPT_SLOW_$base
      if [ ${!testname}x != x ]; then
+        LAST_SKIPPED="y"
          TESTNAME=test_$1 skip "skipping SLOW test $1 (base $base)"
          return 0
      fi
  
+    LAST_SKIPPED=
      run_one $1 "$2"
-    
+
      return $?
  }
  
@@ -1351,6 +1591,7 @@ log() {
      MSG=${MSG//\|/\\\|}
      MSG=${MSG//\>/\\\>}
      MSG=${MSG//\</\\\<}
+    MSG=${MSG//\//\\\/}
      local NODES=$(nodes_list)
      for NODE in $NODES; do
          do_node $NODE $LCTL mark "$MSG" 2> /dev/null || true
@@ -1366,7 +1607,8 @@ trace() {
  }
  
  pass() {
-    echo PASS $@
+    $TEST_FAILED && echo -n "FAIL " || echo -n "PASS " 
+    echo $@
  }
  
  check_mds() {
@@ -1389,6 +1631,7 @@ run_one() {
      message=$2
      tfile=f${testnum}
      export tdir=d0.${TESTSUITE}/d${base}
+
      local SAVE_UMASK=`umask`
      umask 0022
  
@@ -1396,19 +1639,19 @@ run_one() {
      log "== test $testnum: $message ============ `date +%H:%M:%S` ($BEFORE)"
      #check_mds
      export TESTNAME=test_$testnum
+    TEST_FAILED=false
      test_${testnum} || error "test_$testnum failed with $?"
      #check_mds
      cd $SAVE_PWD
      reset_fail_loc
      check_grant ${testnum} || error "check_grant $testnum failed with $?"
-    [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \
-        error "LBUG/LASSERT detected"
+    check_catastrophe || error "LBUG/LASSERT detected"
      ps auxww | grep -v grep | grep -q multiop && error "multiop still running"
      pass "($((`date +%s` - $BEFORE))s)"
+    TEST_FAILED=false
      unset TESTNAME
      unset tdir
      umask $SAVE_UMASK
-    $CLEANUP
  }
  
  canonical_path() {
@@ -1416,8 +1659,8 @@ canonical_path() {
  }
  
  sync_clients() {
-    [ -d $DIR1 ] && cd $DIR1 && sync; sleep 1; sync 
-    [ -d $DIR2 ] && cd $DIR2 && sync; sleep 1; sync 
+    [ -d $DIR1 ] && cd $DIR1 && sync; sleep 1; sync
+    [ -d $DIR2 ] && cd $DIR2 && sync; sleep 1; sync
         cd $SAVE_PWD
  }
  
@@ -1435,7 +1678,7 @@ check_grant() {
         for i in `seq $OSTCOUNT`; do
                 $LFS setstripe $DIR1/${tfile}_check_grant_$i -i $(($i -1)) -c 1
                 dd if=/dev/zero of=$DIR1/${tfile}_check_grant_$i bs=4k \
-                                             count=1 > /dev/null 2>&1 
+                                             count=1 > /dev/null 2>&1
         done
      # sync all the data and make sure no pending data on server
      sync_clients
@@ -1455,7 +1698,7 @@ check_grant() {
                 rm $DIR1/${tfile}_check_grant_$i
         done
  
-       #check whether client grant == server grant 
+       #check whether client grant == server grant
         if [ $client_grant != $server_grant ]; then
                 echo "failed: client:${client_grant} server: ${server_grant}"
                 return 1
@@ -1477,9 +1720,14 @@ osc_to_ost()
      echo $ost
  }
  
+remote_node () {
+    local node=$1
+    [ "$node" != "$(hostname)" ]
+}
+
  remote_mds ()
  {
-    [ -z "$(lctl dl | grep mdt)" ]
+    remote_node $mds_HOST
  }
  
  remote_mds_nodsh()
@@ -1489,7 +1737,11 @@ remote_mds_nodsh()
  
  remote_ost ()
  {
-    [ -z "$(lctl dl | grep ost)" ]
+    local node
+    for node in $(osts_nodes) ; do
+        remote_node $node && return 0
+    done
+    return 1
  }
  
  remote_ost_nodsh()
@@ -1497,6 +1749,10 @@ remote_ost_nodsh()
      remote_ost && [ "$PDSH" = "no_dsh" -o -z "$PDSH" -o -z "$ost_HOST" ]
  }
  
+remote_servers () {
+    remote_ost && remote_mds
+}
+
  osts_nodes () {
      local OSTNODES=$(facet_host ost1)
      local NODES_sort
@@ -1527,14 +1783,59 @@ nodes_list () {
      echo $myNODES_sort
  }
  
+remote_nodes_list () {
+    local rnodes=$(nodes_list)
+    rnodes=$(echo " $rnodes " | sed -re "s/\s+$HOSTNAME\s+/ /g")
+    echo $rnodes
+}
+
+init_clients_lists () {
+    # Sanity check: exclude the local client from RCLIENTS
+    local rclients=$(echo " $RCLIENTS " | sed -re "s/\s+$HOSTNAME\s+/ /g")
+
+    # Sanity check: exclude the dup entries
+    rclients=$(for i in $rclients; do echo $i; done | sort -u)
+
+    local clients="$SINGLECLIENT $HOSTNAME $rclients"
+
+    # Sanity check: exclude the dup entries from CLIENTS
+    # for those configs which has SINGLCLIENT set to local client
+    clients=$(for i in $clients; do echo $i; done | sort -u)
+
+    CLIENTS=`comma_list $clients`
+    local -a remoteclients=($rclients)
+    for ((i=0; $i<${#remoteclients[@]}; i++)); do
+            varname=CLIENT$((i + 2))
+            eval $varname=${remoteclients[i]}
+    done
+
+    CLIENTCOUNT=$((${#remoteclients[@]} + 1))
+}
+
+get_random_entry () {
+    local rnodes=$1
+
+    rnodes=${rnodes//,/ }
+
+    local nodes=($rnodes)
+    local num=${#nodes[@]} 
+    local i=$((RANDOM * num  / 65536))
+
+    echo ${nodes[i]}
+}
+
  is_patchless ()
  {
      lctl get_param version | grep -q patchless
  }
  
+check_versions () {
+    [ "$MDSVER" = "$CLIVER" -a "$OSTVER" = "$CLIVER" ]
+}
+
  get_node_count() {
-   local nodes="$@"
-   echo $nodes | wc -w || true
+    local nodes="$@"
+    echo $nodes | wc -w || true
  }
  
  mixed_ost_devs () {
@@ -1543,30 +1844,48 @@ mixed_ost_devs () {
      [ ! "$OSTCOUNT" = "$osscount" ]
  }
  
+generate_machine_file() {
+    local nodes=${1//,/ }
+    local machinefile=$2
+    rm -f $machinefile || error "can't rm $machinefile"
+    for node in $nodes; do
+        echo $node >>$machinefile
+    done
+}
+
+get_stripe () {
+    local file=$1/stripe
+    touch $file
+    $LFS getstripe -v $file || error
+    rm -f $file
+}
+
  check_runas_id_ret() {
      local myRC=0
-    local myRUNAS_ID=$1
-    shift
+    local myRUNAS_UID=$1
+    local myRUNAS_GID=$2
+    shift 2
      local myRUNAS=$@
      if [ -z "$myRUNAS" ]; then
          error_exit "myRUNAS command must be specified for check_runas_id"
      fi
      mkdir $DIR/d0_runas_test
      chmod 0755 $DIR
-    chown $myRUNAS_ID:$myRUNAS_ID $DIR/d0_runas_test
+    chown $myRUNAS_UID:$myRUNAS_GID $DIR/d0_runas_test
      $myRUNAS touch $DIR/d0_runas_test/f$$ || myRC=1
      rm -rf $DIR/d0_runas_test
      return $myRC
  }
  
  check_runas_id() {
-    local myRUNAS_ID=$1
-    shift
+    local myRUNAS_UID=$1
+    local myRUNAS_GID=$2
+    shift 2
      local myRUNAS=$@
-    check_runas_id_ret $myRUNAS_ID $myRUNAS || \
-        error "unable to write to $DIR/d0_runas_test as UID $myRUNAS_ID. 
-        Please set RUNAS_ID to some UID which exists on MDS and client or 
-        add user $myRUNAS_ID:$myRUNAS_ID on these nodes."
+    check_runas_id_ret $myRUNAS_UID $myRUNAS_GID $myRUNAS || \
+        error "unable to write to $DIR/d0_runas_test as UID $myRUNAS_UID.
+        Please set RUNAS_ID to some UID which exists on MDS and client or
+        add user $myRUNAS_UID:$myRUNAS_GID on these nodes."
  }
  
  # Run multiop in the background, but wait for it to print
@@ -1583,7 +1902,10 @@ multiop_bg_pause() {
      $MULTIOP_PROG $FILE v$ARGS > $TMPPIPE &
  
      echo "TMPPIPE=${TMPPIPE}"
-    read -t 60 multiop_output < $TMPPIPE
+    local multiop_output
+    local multiop_pid
+
+    read -t 60 multiop_output multiop_pid < $TMPPIPE
      if [ $? -ne 0 ]; then
          rm -f $TMPPIPE
          return 1
@@ -1595,9 +1917,31 @@ multiop_bg_pause() {
          return 1
      fi
  
+    echo $multiop_pid
      return 0
  }
  
+check_rate() {
+    local OP=$1
+    local TARGET_RATE=$2
+    local NUM_CLIENTS=$3
+    local LOG=$4
+
+    local RATE=$(awk '/^Rate: [0-9\.]+ '"${OP}"'s\/sec/ { print $2}' ${LOG})
+
+    # We need to use bc since the rate is a floating point number
+    local RES=$(echo "${RATE} < ${TARGET_RATE}" | bc -l )
+    if [ "${RES}" = 0 ]; then
+        echo "Success: ${RATE} ${OP}s/sec met target rate" \
+             "${TARGET_RATE} ${OP}s/sec for ${NUM_CLIENTS} client(s)."
+        return 0
+    else
+        echo "Failure: ${RATE} ${OP}s/sec did not meet target rate" \
+             "${TARGET_RATE} ${OP}s/sec for ${NUM_CLIENTS} client(s)."
+        return 1
+    fi
+}
+
  # reset llite stat counters
  clear_llite_stats(){
          lctl set_param -n llite.*.stats 0
@@ -1610,6 +1954,14 @@ calc_llite_stats() {
          echo $res
  }
  
+calc_sum () {
+        awk 'BEGIN {s = 0}; {s += $1}; END {print s}'
+}
+
+calc_osc_kbytes () {
+        $LCTL get_param -n osc.*[oO][sS][cC][-_]*.$1 | calc_sum
+}
+
  # save_lustre_params(node, parameter_mask)
  # generate a stream of formatted strings (<node> <param name>=<param value>)
  save_lustre_params() {
@@ -1627,3 +1979,49 @@ restore_lustre_params() {
          done
  }
  
+check_catastrophe () {
+    local rnodes=${1:-$(comma_list $(remote_nodes_list))}
+
+    [ -f $CATASTROPHE ] && [ $(cat $CATASTROPHE) -ne 0 ] && return 1
+    if [ $rnodes ]; then
+        do_nodes $rnodes "set -x; [ -f $CATASTROPHE ] && { [ \`cat $CATASTROPHE\` -eq 0 ] || false; } || true"
+    fi
+}
+
+# $1 node
+# $2 file
+get_stripe_info() {
+       local tmp_file
+
+       stripe_size=0
+       stripe_count=0
+       stripe_index=0
+       tmp_file=$(mktemp)
+
+       do_facet $1 lfs getstripe -v $2 > $tmp_file
+
+       stripe_size=`awk '$1 ~ /size/ {print $2}' $tmp_file`
+       stripe_count=`awk '$1 ~ /count/ {print $2}' $tmp_file`
+       stripe_index=`awk '/obdidx/ {start = 1; getline; print $1; exit}' $tmp_file`
+       rm -f $tmp_file
+}
+
+mpi_run () {
+    local mpirun="$MPIRUN $MPIRUN_OPTIONS"
+    local command="$mpirun $@"
+
+    if [ "$MPI_USER" != root -a $mpirun ]; then
+        echo "+ chmod 0777 $MOUNT"
+        chmod 0777 $MOUNT
+        command="su $MPI_USER sh -c \"$command \""
+    fi
+
+    ls -ald $MOUNT
+    echo "+ $command"
+    eval $command
+}
+
+delayed_recovery_enabled () {
+    do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age" > /dev/null 2>&1
+}
+
diff --git a/lustre/tests/test2.c b/lustre/tests/test2.c

index fbbe6bb..9224e48 100755 (executable)
--- a/lustre/tests/test2.c
+++ b/lustre/tests/test2.c
@@ -1,8 +1,37 @@
-/*
- * Copyright (C) 2001  Cluster File Systems, Inc.
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- * This code is issued under the GNU General Public License.
- * See the file COPYING in this distribution
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <stdio.h>
diff --git a/lustre/tests/test_brw.c b/lustre/tests/test_brw.c

index fe18021..9be2796 100644 (file)
--- a/lustre/tests/test_brw.c
+++ b/lustre/tests/test_brw.c
@@ -1,5 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  /* for O_DIRECT */
diff --git a/lustre/tests/toexcl.c b/lustre/tests/toexcl.c

index 7f099e8..43ca4c9 100644 (file)
--- a/lustre/tests/toexcl.c
+++ b/lustre/tests/toexcl.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <stdio.h>
diff --git a/lustre/tests/truncate.c b/lustre/tests/truncate.c

index c49fb15..28eb26e 100644 (file)
--- a/lustre/tests/truncate.c
+++ b/lustre/tests/truncate.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <unistd.h>
  #include <stdio.h>
  #include <errno.h>
diff --git a/lustre/tests/unlinkmany.c b/lustre/tests/unlinkmany.c

index 080b1c6..0d075c6 100644 (file)
--- a/lustre/tests/unlinkmany.c
+++ b/lustre/tests/unlinkmany.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/stat.h>
diff --git a/lustre/tests/utime.c b/lustre/tests/utime.c

index 8d8cd08..8bc5259 100644 (file)
--- a/lustre/tests/utime.c
+++ b/lustre/tests/utime.c
@@ -1,6 +1,43 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
  /*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/utime.c
+ *
   * Simple test for validating mtime on a file create and set via utime.
   */
+
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/types.h>
diff --git a/lustre/tests/wantedi.c b/lustre/tests/wantedi.c

deleted file mode 100644 (file)

index 7c78924..0000000
--- a/lustre/tests/wantedi.c
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <errno.h>
-#include <string.h>
-#include <fcntl.h>
-#include <getopt.h>
-#include <unistd.h>
-#include <time.h>
-#include <limits.h>
-#include <sys/ioctl.h>
-#include <liblustre.h>
-#include <obd.h>
-#include <lustre_lib.h>
-
-static int usage(char *prog, FILE *out)
-{
-        fprintf(out,
-               "Usage: %s <dir> <desired child ino>\n", prog);
-        exit(out == stderr);
-}
-
-#define LDISKFS_IOC_CREATE_INUM            _IOW('f', 5, long)
-
-int main(int argc, char ** argv)
-{
-        int dirfd, wantedi, rc;
-
-       if (argc < 2 || argc > 3)
-               usage(argv[0], stderr);
-       
-       dirfd = open(argv[1], O_RDONLY);
-       if (dirfd < 0) {
-              perror("open");
-              exit(1);
-       }
-        
-       wantedi = atoi(argv[2]);
-       printf("Creating %s/%d with ino %d\n", argv[1], wantedi, wantedi);
-
-       rc = ioctl(dirfd, LDISKFS_IOC_CREATE_INUM, wantedi);
-       if (rc < 0) {
-              perror("ioctl(LDISKFS_IOC_CREATE_INUM)");
-              exit(2);
-       }
-
-        return 0;
-}
diff --git a/lustre/tests/write_append_truncate.c b/lustre/tests/write_append_truncate.c

index 8f95043..24f9043 100644 (file)
--- a/lustre/tests/write_append_truncate.c
+++ b/lustre/tests/write_append_truncate.c
@@ -1,6 +1,40 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/write_append_truncate.c
+ *
   * Each loop does 3 things:
   *   - truncate file to zero (not via ftruncate though, to test O_APPEND)
   *   - append a "chunk" of data (should be at file offset 0 after truncate)
@@ -16,6 +50,7 @@
   *  or:     pdsh -w <two hosts> write_append_truncate <file>
   *  or:     prun -n 2 [-N 2] write_append_truncate <file>
   */
+
  #include <stdlib.h>
  #include <stdio.h>
  #include <stdarg.h>
diff --git a/lustre/tests/write_disjoint.c b/lustre/tests/write_disjoint.c

index 431c14c..e02287d 100644 (file)
--- a/lustre/tests/write_disjoint.c
+++ b/lustre/tests/write_disjoint.c
@@ -1,6 +1,40 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/write_disjoint.c
+ *
   * Each loop does 3 things:
   *   - rank 0 truncates to 0
   *   - all ranks agree on a random chunk size
@@ -13,6 +47,7 @@
   *  or:     pdsh -w <N hosts> write_disjoint
   *  or:     prun -n N [-N M] write_disjoint
   */
+
  #include <stdlib.h>
  #include <stdio.h>
  #include <sys/types.h>
diff --git a/lustre/tests/writemany.c b/lustre/tests/writemany.c

index 03b48fb..c9db9f9 100644 (file)
--- a/lustre/tests/writemany.c
+++ b/lustre/tests/writemany.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdlib.h>
  #include <stdio.h>
  #include <unistd.h>
diff --git a/lustre/tests/writeme.c b/lustre/tests/writeme.c

index bf5c971..0fdd798 100644 (file)
--- a/lustre/tests/writeme.c
+++ b/lustre/tests/writeme.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <fcntl.h>
  #include <unistd.h>
  #include <stdlib.h>
diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore

index b942408..88162c0 100644 (file)
--- a/lustre/utils/.cvsignore
+++ b/lustre/utils/.cvsignore
@@ -16,9 +16,11 @@ wirecheck
  wiretest
  llog_reader
  lr_reader
+lshowmount
  ltrack_stats
  obdio
  obdbarrier
+ll_recover_lost_found_objs
  lload
  llverfs
  llverdev
diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am

index 99d893c..84e274f 100644 (file)
--- a/lustre/utils/Makefile.am
+++ b/lustre/utils/Makefile.am
@@ -16,7 +16,7 @@ EXTRA_PROGRAMS = wirecheck
  rootsbin_PROGRAMS = mount.lustre
  sbin_PROGRAMS = mkfs.lustre tunefs.lustre lctl wiretest \
         l_getgroups llverfs llverdev llog_reader ll_recover_lost_found_objs \
-       lr_reader ltrack_stats
+       lr_reader ltrack_stats lshowmount
  if LIBPTHREAD
  sbin_PROGRAMS += loadgen
  endif
@@ -28,16 +28,19 @@ endif # UTILS
  lib_LIBRARIES = liblustreapi.a
  
  lctl_SOURCES = parser.c obd.c lustre_cfg.c lctl.c parser.h obdctl.h platform.h
-lctl_LDADD := $(LIBREADLINE) $(LIBPTLCTL)
-lctl_DEPENDENCIES := $(LIBPTLCTL)
+lctl_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL)
+lctl_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a
  
  lfs_SOURCES = lfs.c parser.c lustre_cfg.c obd.c
  lfs_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL)
  lfs_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a 
  
  loadgen_SOURCES = loadgen.c lustre_cfg.c obd.c
-loadgen_LDADD := $(LIBREADLINE) $(LIBPTLCTL) $(PTHREAD_LIBS)
-loadgen_DEPENDENCIES := $(LIBPTLCTL)
+loadgen_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL) $(PTHREAD_LIBS)
+loadgen_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a
+
+lshowmount_SOURCES    = lshowmount.h lshowmount.c hash.c hash.h \
+                       thread.c thread.h hostlist.c hostlist.h
  
  if EXT2FS_DEVEL
  EXT2FSLIB = -lext2fs
@@ -77,11 +80,11 @@ ll_recover_lost_found_objs_DEPENDENCIES := $(LIBPTLCTL)
  
  lr_reader_SOURCES = lr_reader.c
  
-mount_lustre_SOURCES = mount_lustre.c 
+mount_lustre_SOURCES = mount_lustre.c mount_utils.c mount_utils.h
  mount_lustre_LDADD := $(LIBPTLCTL)
  mount_lustre_DEPENDENCIES := $(LIBPTLCTL)
  
-mkfs_lustre_SOURCES = mkfs_lustre.c
+mkfs_lustre_SOURCES = mkfs_lustre.c mount_utils.c mount_utils.h
  mkfs_lustre_CPPFLAGS = -UTUNEFS $(AM_CPPFLAGS)
  mkfs_lustre_LDADD := $(LIBPTLCTL)
  mkfs_lustre_DEPENDENCIES := $(LIBPTLCTL)
diff --git a/lustre/utils/hash.c b/lustre/utils/hash.c

new file mode 100644 (file)

index 0000000..374a315
--- /dev/null
+++ b/lustre/utils/hash.c
@@ -0,0 +1,446 @@
+/*****************************************************************************
+ *  $Id: hash.c,v 1.1.10.2 2008/12/18 18:02:13 johann Exp $
+ *****************************************************************************
+ *  Copyright (C) 2003-2005 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Chris Dunlap <cdunlap@llnl.gov>.
+ *
+ *  This file is from LSD-Tools, the LLNL Software Development Toolbox.
+ *
+ *  This is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License;
+ *  if not, write to the Free Software Foundation, Inc., 51 Franklin Street,
+ *  Fifth Floor, Boston, MA  02110-1301  USA.
+ *****************************************************************************
+ *  Refer to "hash.h" for documentation on public functions.
+ *****************************************************************************/
+
+
+#if HAVE_CONFIG_H
+#  include "config.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "thread.h"
+#include "hash.h"
+
+
+/*****************************************************************************
+ *  Constants
+ *****************************************************************************/
+
+#define HASH_ALLOC      1024
+#define HASH_DEF_SIZE   1213
+#define HASH_MAGIC      0xDEADBEEF
+
+
+/*****************************************************************************
+ *  Data Types
+ *****************************************************************************/
+
+struct hash_node {
+    struct hash_node   *next;           /* next node in list                 */
+    void               *data;           /* ptr to hashed item                */
+    const void         *hkey;           /* ptr to hashed item's key          */
+};
+
+struct hash {
+    int                 count;          /* number of items in hash table     */
+    int                 size;           /* num slots allocated in hash table */
+    struct hash_node  **table;          /* hash table array of node ptrs     */
+    hash_cmp_f          cmp_f;          /* key comparison function           */
+    hash_del_f          del_f;          /* item deletion function            */
+    hash_key_f          key_f;          /* key hash function                 */
+#if WITH_PTHREADS
+    pthread_mutex_t     mutex;          /* mutex to protect access to hash   */
+#endif /* WITH_PTHREADS */
+#ifndef NDEBUG
+    unsigned int        magic;          /* sentinel for asserting validity   */
+#endif /* NDEBUG */
+};
+
+
+/*****************************************************************************
+ *  Prototypes
+ *****************************************************************************/
+
+static struct hash_node * hash_node_alloc (void);
+
+static void hash_node_free (struct hash_node *node);
+
+
+/*****************************************************************************
+ *  Variables
+ *****************************************************************************/
+
+#if 0
+static struct hash_node *hash_free_list = NULL;
+#endif
+
+#if WITH_PTHREADS
+static pthread_mutex_t hash_free_lock = PTHREAD_MUTEX_INITIALIZER;
+#endif /* WITH_PTHREADS */
+
+
+/*****************************************************************************
+ *  Macros
+ *****************************************************************************/
+
+#ifdef WITH_LSD_FATAL_ERROR_FUNC
+#  undef lsd_fatal_error
+   extern void lsd_fatal_error (char *file, int line, char *mesg);
+#else /* !WITH_LSD_FATAL_ERROR_FUNC */
+#  ifndef lsd_fatal_error
+#    define lsd_fatal_error(file, line, mesg) (abort ())
+#  endif /* !lsd_fatal_error */
+#endif /* !WITH_LSD_FATAL_ERROR_FUNC */
+
+#ifdef WITH_LSD_NOMEM_ERROR_FUNC
+#  undef lsd_nomem_error
+   extern void * lsd_nomem_error (char *file, int line, char *mesg);
+#else /* !WITH_LSD_NOMEM_ERROR_FUNC */
+#  ifndef lsd_nomem_error
+#    define lsd_nomem_error(file, line, mesg) (NULL)
+#  endif /* !lsd_nomem_error */
+#endif /* !WITH_LSD_NOMEM_ERROR_FUNC */
+
+
+/*****************************************************************************
+ *  Functions
+ *****************************************************************************/
+
+hash_t
+hash_create (int size, hash_key_f key_f, hash_cmp_f cmp_f, hash_del_f del_f)
+{
+    hash_t h;
+
+    if (!cmp_f || !key_f) {
+        errno = EINVAL;
+        return (NULL);
+    }
+    if (size <= 0) {
+        size = HASH_DEF_SIZE;
+    }
+    if (!(h = malloc (sizeof (*h)))) {
+        return (lsd_nomem_error (__FILE__, __LINE__, "hash_create"));
+    }
+    if (!(h->table = calloc (size, sizeof (struct hash_node *)))) {
+        free (h);
+        return (lsd_nomem_error (__FILE__, __LINE__, "hash_create"));
+    }
+    h->count = 0;
+    h->size = size;
+    h->cmp_f = cmp_f;
+    h->del_f = del_f;
+    h->key_f = key_f;
+    lsd_mutex_init (&h->mutex);
+    assert (h->magic = HASH_MAGIC);     /* set magic via assert abuse */
+    return (h);
+}
+
+
+void
+hash_destroy (hash_t h)
+{
+    int i;
+    struct hash_node *p, *q;
+
+    if (!h) {
+        errno = EINVAL;
+        return;
+    }
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    for (i = 0; i < h->size; i++) {
+        for (p = h->table[i]; p != NULL; p = q) {
+            q = p->next;
+            if (h->del_f)
+                h->del_f (p->data);
+            hash_node_free (p);
+        }
+    }
+    assert (h->magic = ~HASH_MAGIC);    /* clear magic via assert abuse */
+    lsd_mutex_unlock (&h->mutex);
+    lsd_mutex_destroy (&h->mutex);
+    free (h->table);
+    free (h);
+    return;
+}
+
+
+int
+hash_is_empty (hash_t h)
+{
+    int n;
+
+    if (!h) {
+        errno = EINVAL;
+        return (0);
+    }
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    n = h->count;
+    lsd_mutex_unlock (&h->mutex);
+    return (n == 0);
+}
+
+
+int
+hash_count (hash_t h)
+{
+    int n;
+
+    if (!h) {
+        errno = EINVAL;
+        return (0);
+    }
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    n = h->count;
+    lsd_mutex_unlock (&h->mutex);
+    return (n);
+}
+
+
+void *
+hash_find (hash_t h, const void *key)
+{
+    unsigned int slot;
+    struct hash_node *p;
+    void *data = NULL;
+
+    if (!h || !key) {
+        errno = EINVAL;
+        return (NULL);
+    }
+    errno = 0;
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    slot = h->key_f (key) % h->size;
+    for (p = h->table[slot]; p != NULL; p = p->next) {
+        if (!h->cmp_f (p->hkey, key)) {
+            data = p->data;
+            break;
+        }
+    }
+    lsd_mutex_unlock (&h->mutex);
+    return (data);
+}
+
+
+void *
+hash_insert (hash_t h, const void *key, void *data)
+{
+    struct hash_node *p;
+    unsigned int slot;
+
+    if (!h || !key || !data) {
+        errno = EINVAL;
+        return (NULL);
+    }
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    slot = h->key_f (key) % h->size;
+    for (p = h->table[slot]; p != NULL; p = p->next) {
+        if (!h->cmp_f (p->hkey, key)) {
+            errno = EEXIST;
+            data = NULL;
+            goto end;
+        }
+    }
+    if (!(p = hash_node_alloc ())) {
+        data = lsd_nomem_error (__FILE__, __LINE__, "hash_insert");
+        goto end;
+    }
+    p->hkey = key;
+    p->data = data;
+    p->next = h->table[slot];
+    h->table[slot] = p;
+    h->count++;
+
+end:
+    lsd_mutex_unlock (&h->mutex);
+    return (data);
+}
+
+
+void *
+hash_remove (hash_t h, const void *key)
+{
+    struct hash_node **pp;
+    struct hash_node *p;
+    unsigned int slot;
+    void *data = NULL;
+
+    if (!h || !key) {
+        errno = EINVAL;
+        return (NULL);
+    }
+    errno = 0;
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    slot = h->key_f (key) % h->size;
+    for (pp = &(h->table[slot]); (p = *pp) != NULL; pp = &((*pp)->next)) {
+        if (!h->cmp_f (p->hkey, key)) {
+            data = p->data;
+            *pp = p->next;
+            hash_node_free (p);
+            h->count--;
+            break;
+        }
+    }
+    lsd_mutex_unlock (&h->mutex);
+    return (data);
+}
+
+
+int
+hash_delete_if (hash_t h, hash_arg_f arg_f, void *arg)
+{
+    int i;
+    struct hash_node **pp;
+    struct hash_node *p;
+    int n = 0;
+
+    if (!h || !arg_f) {
+        errno = EINVAL;
+        return (-1);
+    }
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    for (i = 0; i < h->size; i++) {
+        pp = &(h->table[i]);
+        while ((p = *pp) != NULL) {
+            if (arg_f (p->data, p->hkey, arg) > 0) {
+                if (h->del_f)
+                    h->del_f (p->data);
+                *pp = p->next;
+                hash_node_free (p);
+                h->count--;
+                n++;
+            }
+            else {
+                pp = &(p->next);
+            }
+        }
+    }
+    lsd_mutex_unlock (&h->mutex);
+    return (n);
+}
+
+
+int
+hash_for_each (hash_t h, hash_arg_f arg_f, void *arg)
+{
+    int i;
+    struct hash_node *p;
+    int n = 0;
+
+    if (!h || !arg_f) {
+        errno = EINVAL;
+        return (-1);
+    }
+    lsd_mutex_lock (&h->mutex);
+    assert (h->magic == HASH_MAGIC);
+    for (i = 0; i < h->size; i++) {
+        for (p = h->table[i]; p != NULL; p = p->next) {
+            if (arg_f (p->data, p->hkey, arg) > 0) {
+                n++;
+            }
+        }
+    }
+    lsd_mutex_unlock (&h->mutex);
+    return (n);
+}
+
+
+/*****************************************************************************
+ *  Hash Functions
+ *****************************************************************************/
+
+unsigned int
+hash_key_string (const char *str)
+{
+    unsigned char *p;
+    unsigned int hval = 0;
+    const unsigned int multiplier = 31;
+
+    for (p = (unsigned char *) str; *p != '\0'; p++) {
+        hval += (multiplier * hval) + *p;
+    }
+    return (hval);
+}
+
+
+/*****************************************************************************
+ *  Internal Functions
+ *****************************************************************************/
+
+static struct hash_node *
+hash_node_alloc (void)
+{
+/*  Allocates a hash node from the freelist.
+ *  Memory is allocated in chunks of HASH_ALLOC.
+ *  Returns a ptr to the object, or NULL if memory allocation fails.
+ */
+#if 0
+    int i;
+#endif
+    struct hash_node *p = NULL;
+
+    assert (HASH_ALLOC > 0);
+    lsd_mutex_lock (&hash_free_lock);
+#if 0
+    if (!hash_free_list) {
+        if ((hash_free_list = malloc (HASH_ALLOC * sizeof (*p)))) {
+            for (i = 0; i < HASH_ALLOC - 1; i++)
+                hash_free_list[i].next = &hash_free_list[i+1];
+            hash_free_list[i].next = NULL;
+        }
+    }
+    if (hash_free_list) {
+        p = hash_free_list;
+        hash_free_list = p->next;
+    }
+    else {
+        errno = ENOMEM;
+    }
+#else
+    if (!(p = malloc (sizeof(*p))))
+        errno = ENOMEM;
+#endif
+    lsd_mutex_unlock (&hash_free_lock);
+    return (p);
+}
+
+
+static void
+hash_node_free (struct hash_node *node)
+{
+/*  De-allocates the object [node], returning it to the freelist.
+ */
+    assert (node != NULL);
+    memset (node, 0, sizeof (*node));
+    lsd_mutex_lock (&hash_free_lock);
+#if 0
+    node->next = hash_free_list;
+    hash_free_list = node;
+#else
+    free (node);
+#endif
+    lsd_mutex_unlock (&hash_free_lock);
+    return;
+}
diff --git a/lustre/utils/hash.h b/lustre/utils/hash.h

new file mode 100644 (file)

index 0000000..bbbfc1a
--- /dev/null
+++ b/lustre/utils/hash.h
@@ -0,0 +1,177 @@
+/*****************************************************************************
+ *  $Id: hash.h,v 1.1.10.2 2008/12/18 18:02:13 johann Exp $
+ *****************************************************************************
+ *  Copyright (C) 2003-2005 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Chris Dunlap <cdunlap@llnl.gov>.
+ *
+ *  This file is from LSD-Tools, the LLNL Software Development Toolbox.
+ *
+ *  This is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License;
+ *  if not, write to the Free Software Foundation, Inc., 51 Franklin Street,
+ *  Fifth Floor, Boston, MA  02110-1301  USA.
+ *****************************************************************************/
+
+
+#ifndef LSD_HASH_H
+#define LSD_HASH_H
+
+
+/*****************************************************************************
+ *  Notes
+ *****************************************************************************/
+/*
+ *  If an item's key is modified after insertion, the hash will be unable to
+ *  locate it if the new key should hash to a different slot in the table.
+ *
+ *  If NDEBUG is not defined, internal debug code will be enabled; this is
+ *  intended for development use only.  Production code should define NDEBUG.
+ *
+ *  If WITH_LSD_FATAL_ERROR_FUNC is defined, the linker will expect to
+ *  find an external lsd_fatal_error(file,line,mesg) function.  By default,
+ *  lsd_fatal_error(file,line,mesg) is a macro definition that aborts.
+ *  This macro may be redefined to invoke another routine instead.
+ *
+ *  If WITH_LSD_NOMEM_ERROR_FUNC is defined, the linker will expect to
+ *  find an external lsd_nomem_error(file,line,mesg) function.  By default,
+ *  lsd_nomem_error(file,line,mesg) is a macro definition that returns NULL.
+ *  This macro may be redefined to invoke another routine instead.
+ *
+ *  If WITH_PTHREADS is defined, these routines will be thread-safe.
+ */
+
+
+/*****************************************************************************
+ *  Data Types
+ *****************************************************************************/
+
+typedef struct hash * hash_t;
+/*
+ *  Hash table opaque data type.
+ */
+
+typedef unsigned int (*hash_key_f) (const void *key);
+/*
+ *  Function prototype for the hash function responsible for converting
+ *    the data's [key] into an unsigned integer hash value.
+ */
+
+typedef int (*hash_cmp_f) (const void *key1, const void *key2);
+/*
+ *  Function prototype for comparing two keys.
+ *  Returns zero if both keys are equal; o/w, returns nonzero.
+ */
+
+typedef void (*hash_del_f) (void *data);
+/*
+ *  Function prototype for de-allocating a data item stored within a hash.
+ *  This function is responsible for freeing all memory associated with
+ *    the [data] item, including any subordinate items.
+ */
+
+typedef int (*hash_arg_f) (void *data, const void *key, void *arg);
+/*
+ *  Function prototype for operating on each element in the hash table.
+ *  The function will be invoked once for each [data] item in the hash,
+ *    with the item's [key] and the specified [arg] being passed in as args.
+ */
+
+
+/*****************************************************************************
+ *  Functions
+ *****************************************************************************/
+
+hash_t hash_create (int size,
+    hash_key_f key_f, hash_cmp_f cmp_f, hash_del_f del_f);
+/*
+ *  Creates and returns a new hash table on success.
+ *    Returns lsd_nomem_error() with errno=ENOMEM if memory allocation fails.
+ *    Returns NULL with errno=EINVAL if [keyf] or [cmpf] is not specified.
+ *  The [size] is the number of slots in the table; a larger table requires
+ *    more memory, but generally provide quicker access times.  If set <= 0,
+ *    the default size is used.
+ *  The [keyf] function converts a key into a hash value.
+ *  The [cmpf] function determines whether two keys are equal.
+ *  The [delf] function de-allocates memory used by items in the hash;
+ *    if set to NULL, memory associated with these items will not be freed
+ *    when the hash is destroyed.
+ */
+
+void hash_destroy (hash_t h);
+/*
+ *  Destroys hash table [h].  If a deletion function was specified when the
+ *    hash was created, it will be called for each item contained within.
+ *  Abadoning a hash without calling hash_destroy() will cause a memory leak.
+ */
+
+int hash_is_empty (hash_t h);
+/*
+ *  Returns non-zero if hash table [h] is empty; o/w, returns zero.
+ */
+
+int hash_count (hash_t h);
+/*
+ *  Returns the number of items in hash table [h].
+ */
+
+void * hash_find (hash_t h, const void *key);
+/*
+ *  Searches for the item corresponding to [key] in hash table [h].
+ *  Returns a ptr to the found item's data on success.
+ *    Returns NULL with errno=0 if no matching item is found.
+ *    Returns NULL with errno=EINVAL if [key] is not specified.
+ */
+
+void * hash_insert (hash_t h, const void *key, void *data);
+/*
+ *  Inserts [data] with the corresponding [key] into hash table [h];
+ *    note that it is permissible for [key] to be set equal to [data].
+ *  Returns a ptr to the inserted item's data on success.
+ *    Returns NULL with errno=EEXIST if [key] already exists in the hash.
+ *    Returns NULL with errno=EINVAL if [key] or [data] is not specified.
+ *    Returns lsd_nomem_error() with errno=ENOMEM if memory allocation fails.
+ */
+
+void * hash_remove (hash_t h, const void *key);
+/*
+ *  Removes the item corresponding to [key] from hash table [h].
+ *  Returns a ptr to the removed item's data on success.
+ *    Returns NULL with errno=0 if no matching item is found.
+ *    Returns NULL with errno=EINVAL if [key] is not specified.
+ */
+
+int hash_delete_if (hash_t h, hash_arg_f argf, void *arg);
+/*
+ *  Conditionally deletes (and de-allocates) items from hash table [h].
+ *  The [argf] function is invoked once for each item in the hash, with
+ *    [arg] being passed in as an argument.  Items for which [argf] returns
+ *    greater-than-zero are deleted.
+ *  Returns the number of items deleted.
+ *    Returns -1 with errno=EINVAL if [argf] is not specified.
+ */
+
+int hash_for_each (hash_t h, hash_arg_f argf, void *arg);
+/*
+ *  Invokes the [argf] function once for each item in hash table [h],
+ *    with [arg] being passed in as an argument.
+ *  Returns the number of items for which [argf] returns greater-than-zero.
+ *    Returns -1 with errno=EINVAL if [argf] is not specified.
+ */
+
+unsigned int hash_key_string (const char *str);
+/*
+ *  A hash_key_f function that hashes the string [str].
+ */
+
+
+#endif /* !LSD_HASH_H */
diff --git a/lustre/utils/hostlist.c b/lustre/utils/hostlist.c

new file mode 100644 (file)

index 0000000..fedbcd5
--- /dev/null
+++ b/lustre/utils/hostlist.c
@@ -0,0 +1,2687 @@
+/*****************************************************************************\
+ *  $Id: hostlist.c,v 1.1.10.2 2008/12/18 18:02:13 johann Exp $
+ *****************************************************************************
+ *  Copyright (C) 2002 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Mark Grondona <mgrondona@llnl.gov>
+ *  UCRL-CODE-2002-040.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#  if HAVE_STRING_H
+#    include <string.h>
+#  endif
+#  if HAVE_PTHREAD_H
+#    include <pthread.h>
+#  endif
+#else                /* !HAVE_CONFIG_H */
+#  include <string.h>
+#  include <pthread.h>
+#endif                /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <errno.h>
+#include <ctype.h>
+#include <sys/param.h>
+#include <unistd.h>
+
+#include "hostlist.h"
+
+/*
+ * lsd_fatal_error : fatal error macro
+ */
+#ifdef WITH_LSD_FATAL_ERROR_FUNC
+#  undef lsd_fatal_error
+   extern void lsd_fatal_error(char *file, int line, char *mesg);
+#else /* !WITH_LSD_FATAL_ERROR_FUNC */
+#  ifndef lsd_fatal_error
+#    define lsd_fatal_error(file, line, mesg)                                \
+       do {                                                                  \
+           fprintf(stderr, "ERROR: [%s:%d] %s: %s\n",                        \
+           file, line, mesg, strerror(errno));                               \
+       } while (0)
+#  endif /* !lsd_fatal_error */
+#endif /* !WITH_LSD_FATAL_ERROR_FUNC */
+
+/*
+ * lsd_nonmem_error
+ */
+#ifdef WITH_LSD_NOMEM_ERROR_FUNC
+#  undef lsd_nomem_error
+   extern void * lsd_nomem_error(char *file, int line, char *mesg);
+#else /* !WITH_LSD_NOMEM_ERROR_FUNC */
+#  ifndef lsd_nomem_error
+#    define lsd_nomem_error(file, line, mesg) (NULL)
+#  endif /* !lsd_nomem_error */
+#endif /* !WITH_LSD_NOMEM_ERROR_FUNC */
+
+/*
+ * OOM helper function
+ *  Automatically call lsd_nomem_error with appropriate args
+ *  and set errno to ENOMEM
+ */
+#define out_of_memory(mesg)                                                  \
+    do {                                                                     \
+        errno = ENOMEM;                                                      \
+        return(lsd_nomem_error(__FILE__, __LINE__, mesg));                   \
+    } while (0)
+
+/*
+ * Some constants and tunables:
+ */
+
+/* number of elements to allocate when extending the hostlist array */
+#define HOSTLIST_CHUNK    16
+
+/* max host range: anything larger will be assumed to be an error */
+#define MAX_RANGE    16384    /* 16K Hosts */
+
+/* max host suffix value */
+#define MAX_HOST_SUFFIX 1<<25
+
+/* max number of ranges that will be processed between brackets */
+#define MAX_RANGES    10240    /* 10K Ranges */
+
+/* size of internal hostname buffer (+ some slop), hostnames will probably
+ * be truncated if longer than MAXHOSTNAMELEN */
+#ifndef MAXHOSTNAMELEN
+#define MAXHOSTNAMELEN    64
+#endif
+
+/* max size of internal hostrange buffer */
+#define MAXHOSTRANGELEN 1024
+
+/* ----[ Internal Data Structures ]---- */
+
+/* hostname type: A convenience structure used in parsing single hostnames */
+struct hostname_components {
+    char *hostname;         /* cache of initialized hostname        */
+    char *prefix;           /* hostname prefix                      */
+    unsigned long num;      /* numeric suffix                       */
+
+    /* string representation of numeric suffix
+     * points into `hostname'                                       */
+    char *suffix;
+};
+
+typedef struct hostname_components *hostname_t;
+
+/* hostrange type: A single prefix with `hi' and `lo' numeric suffix values */
+struct hostrange_components {
+    char *prefix;        /* alphanumeric prefix: */
+
+    /* beginning (lo) and end (hi) of suffix range */
+    unsigned long lo, hi;
+
+    /* width of numeric output format
+     * (pad with zeros up to this width) */
+    int width;
+
+    /* If singlehost is 1, `lo' and `hi' are invalid */
+    unsigned singlehost:1;
+};
+
+typedef struct hostrange_components *hostrange_t;
+
+/* The hostlist type: An array based list of hostrange_t's */
+struct hostlist {
+#ifndef NDEBUG
+#define HOSTLIST_MAGIC    57005
+    int magic;
+#endif
+#if    WITH_PTHREADS
+    pthread_mutex_t mutex;
+#endif                /* WITH_PTHREADS */
+
+    /* current number of elements available in array */
+    int size;
+
+    /* current number of ranges stored in array */
+    int nranges;
+
+    /* current number of hosts stored in hostlist */
+    int nhosts;
+
+    /* pointer to hostrange array */
+    hostrange_t *hr;
+
+    /* list of iterators */
+    struct hostlist_iterator *ilist;
+
+};
+
+
+/* a hostset is a wrapper around a hostlist */
+struct hostset {
+    hostlist_t hl;
+};
+
+struct hostlist_iterator {
+#ifndef NDEBUG
+    int magic;
+#endif
+    /* hostlist we are traversing */
+    hostlist_t hl;
+
+    /* current index of iterator in hl->hr[] */
+    int idx;
+
+    /* current hostrange object in list hl, i.e. hl->hr[idx] */
+    hostrange_t hr;
+
+    /* current depth we've traversed into range hr */
+    int depth;
+
+    /* next ptr for lists of iterators */
+    struct hostlist_iterator *next;
+};
+
+
+/* ---- ---- */
+
+/* ------[ static function prototypes ]------ */
+
+static void _error(char *file, int line, char *mesg, ...);
+static char * _next_tok(char *, char **);
+static int    _zero_padded(unsigned long, int);
+static int    _width_equiv(unsigned long, int *, unsigned long, int *);
+
+static int           host_prefix_end(const char *);
+static hostname_t    hostname_create(const char *);
+static void          hostname_destroy(hostname_t);
+static int           hostname_suffix_is_valid(hostname_t);
+static int           hostname_suffix_width(hostname_t);
+
+static hostrange_t   hostrange_new(void);
+static hostrange_t   hostrange_create_single(const char *);
+static hostrange_t   hostrange_create(char *, unsigned long, unsigned long, int);
+static unsigned long hostrange_count(hostrange_t);
+static hostrange_t   hostrange_copy(hostrange_t);
+static void          hostrange_destroy(hostrange_t);
+static hostrange_t   hostrange_delete_host(hostrange_t, unsigned long);
+static int           hostrange_cmp(hostrange_t, hostrange_t);
+static int           hostrange_prefix_cmp(hostrange_t, hostrange_t);
+static int           hostrange_within_range(hostrange_t, hostrange_t);
+static int           hostrange_width_combine(hostrange_t, hostrange_t);
+static int           hostrange_empty(hostrange_t);
+static char *        hostrange_pop(hostrange_t);
+static char *        hostrange_shift(hostrange_t);
+static int           hostrange_join(hostrange_t, hostrange_t);
+static hostrange_t   hostrange_intersect(hostrange_t, hostrange_t);
+static int           hostrange_hn_within(hostrange_t, hostname_t);
+static size_t        hostrange_to_string(hostrange_t hr, size_t, char *, char *);
+static size_t        hostrange_numstr(hostrange_t, size_t, char *);
+
+static hostlist_t  hostlist_new(void);
+static hostlist_t _hostlist_create_bracketed(const char *, char *, char *);
+static int         hostlist_resize(hostlist_t, size_t);
+static int         hostlist_expand(hostlist_t);
+static int         hostlist_push_range(hostlist_t, hostrange_t);
+static int         hostlist_push_hr(hostlist_t, char *, unsigned long,
+                                    unsigned long, int);
+static int         hostlist_insert_range(hostlist_t, hostrange_t, int);
+static void        hostlist_delete_range(hostlist_t, int n);
+static void        hostlist_coalesce(hostlist_t hl);
+static void        hostlist_collapse(hostlist_t hl);
+static hostlist_t _hostlist_create(const char *, char *, char *);
+static void        hostlist_shift_iterators(hostlist_t, int, int, int);
+static int        _attempt_range_join(hostlist_t, int);
+static int        _is_bracket_needed(hostlist_t, int);
+
+static hostlist_iterator_t hostlist_iterator_new(void);
+static void               _iterator_advance(hostlist_iterator_t);
+static void               _iterator_advance_range(hostlist_iterator_t);
+
+static int hostset_find_host(hostset_t, const char *);
+
+/* ------[ macros ]------ */
+
+#ifdef WITH_PTHREADS
+#  define mutex_init(mutex)                                                  \
+     do {                                                                    \
+        int e = pthread_mutex_init(mutex, NULL);                             \
+        if (e) {                                                             \
+            errno = e;                                                       \
+            lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex init:");     \
+            abort();                                                         \
+        }                                                                    \
+     } while (0)
+
+#  define mutex_lock(mutex)                                                  \
+     do {                                                                    \
+        int e = pthread_mutex_lock(mutex);                                   \
+        if (e) {                                                             \
+           errno = e;                                                        \
+           lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex lock:");      \
+           abort();                                                          \
+        }                                                                    \
+     } while (0)
+
+#  define mutex_unlock(mutex)                                                \
+     do {                                                                    \
+        int e = pthread_mutex_unlock(mutex);                                 \
+        if (e) {                                                             \
+            errno = e;                                                       \
+            lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex unlock:");   \
+            abort();                                                         \
+        }                                                                    \
+     } while (0)
+
+#  define mutex_destroy(mutex)                                               \
+     do {                                                                    \
+        int e = pthread_mutex_destroy(mutex);                                \
+        if (e) {                                                             \
+            errno = e;                                                       \
+            lsd_fatal_error(__FILE__, __LINE__, "hostlist mutex destroy:");  \
+            abort();                                                         \
+        }                                                                    \
+     } while (0)
+
+#else                /* !WITH_PTHREADS */
+
+#  define mutex_init(mutex)
+#  define mutex_lock(mutex)
+#  define mutex_unlock(mutex)
+#  define mutex_destroy(mutex)
+
+#endif                /* WITH_PTHREADS */
+
+#define LOCK_HOSTLIST(_hl)                                                   \
+      do {                                                                   \
+          assert(_hl != NULL);                                               \
+          mutex_lock(&(_hl)->mutex);                                         \
+          assert((_hl)->magic == HOSTLIST_MAGIC);                            \
+      } while (0)
+
+#define UNLOCK_HOSTLIST(_hl)                                                 \
+      do {                                                                   \
+          mutex_unlock(&(_hl)->mutex);                                       \
+      } while (0)
+
+#define seterrno_ret(_errno, _rc)                                            \
+      do {                                                                   \
+          errno = _errno;                                                    \
+          return _rc;                                                        \
+      } while (0)
+
+/* ------[ Function Definitions ]------ */
+
+/* ----[ general utility functions ]---- */
+
+
+/*
+ *  Varargs capable error reporting via lsd_fatal_error()
+ */
+static void _error(char *file, int line, char *msg, ...)
+{
+    va_list ap;
+    char    buf[1024];
+    int     len = 0;
+    va_start(ap, msg);
+
+    len = vsnprintf(buf, 1024, msg, ap);
+    if ((len < 0) || (len > 1024))
+        buf[1023] = '\0';
+
+    lsd_fatal_error(file, line, buf);
+
+    va_end(ap);
+    return;
+}
+
+static int _advance_past_brackets (char *tok, char **str)
+{
+    /* if _single_ opening bracket exists b/w tok and str, push str
+     * past first closing bracket to next seperator */
+    if (   memchr(tok, '[', *str - tok) != NULL
+        && memchr(tok, ']', *str - tok) == NULL ) {
+        char *q = strchr(*str, ']');
+        if (q && memchr(*str, '[', q - *str) == NULL) {
+            *str = q + 1;
+            return (1);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Helper function for host list string parsing routines
+ * Returns a pointer to the next token; additionally advance *str
+ * to the next separator.
+ *
+ * next_tok was taken directly from pdsh courtesy of Jim Garlick.
+ * (with modifications to support bracketed hostlists, i.e.:
+ *  xxx[xx,xx,xx] is a single token)
+ *
+ */
+static char * _next_tok(char *sep, char **str)
+{
+    char *tok;
+
+    /* push str past any leading separators */
+    while (**str != '\0' && strchr(sep, **str) != '\0')
+        (*str)++;
+
+    if (**str == '\0')
+        return NULL;
+
+    /* assign token ptr */
+    tok = *str;
+
+    /*
+     * Advance str past any separators, but if a separator occurs between
+     *  brackets, e.g. foo[0-3,5], then advance str past closing brackets and
+     *  try again.
+     */
+    do {
+        /* push str past token and leave pointing to first separator */
+        while (**str != '\0' && strchr(sep, **str) == '\0')
+            (*str)++;
+    } while (_advance_past_brackets (tok, str));
+
+   /* nullify consecutive separators and push str beyond them */
+    while (**str != '\0' && strchr(sep, **str) != '\0')
+        *(*str)++ = '\0';
+
+    return tok;
+}
+
+
+/* return the number of zeros needed to pad "num" to "width"
+ */
+static int _zero_padded(unsigned long num, int width)
+{
+    int n = 1;
+    while (num /= 10L)
+        n++;
+    return width > n ? width - n : 0;
+}
+
+/* test whether two format `width' parameters are "equivalent"
+ * The width arguments "wn" and "wm" for integers "n" and "m"
+ * are equivalent if:
+ *
+ *  o  wn == wm  OR
+ *
+ *  o  applying the same format width (either wn or wm) to both of
+ *     'n' and 'm' will not change the zero padding of *either* 'm' nor 'n'.
+ *
+ *  If this function returns 1 (or true), the appropriate width value
+ *  (either 'wm' or 'wn') will have been adjusted such that both format
+ *  widths are equivalent.
+ */
+static int _width_equiv(unsigned long n, int *wn, unsigned long m, int *wm)
+{
+    int npad, nmpad, mpad, mnpad;
+
+    if (wn == wm)
+        return 1;
+
+    npad = _zero_padded(n, *wn);
+    nmpad = _zero_padded(n, *wm);
+    mpad = _zero_padded(m, *wm);
+    mnpad = _zero_padded(m, *wn);
+
+    if (npad != nmpad && mpad != mnpad)
+        return 0;
+
+    if (npad != nmpad) {
+        if (mpad == mnpad) {
+            *wm = *wn;
+            return 1;
+        } else
+            return 0;
+    } else {        /* mpad != mnpad */
+        if (npad == nmpad) {
+            *wn = *wm;
+            return 1;
+        } else
+            return 0;
+    }
+
+    /* not reached */
+}
+
+
+/* ----[ hostname_t functions ]---- */
+
+/*
+ * return the location of the last char in the hostname prefix
+ */
+static int host_prefix_end(const char *hostname)
+{
+    int idx = strlen(hostname) - 1;
+
+    while (idx >= 0 && isdigit((char) hostname[idx]))
+        idx--;
+    return idx;
+}
+
+/*
+ * create a hostname_t object from a string hostname
+ */
+static hostname_t hostname_create(const char *hostname)
+{
+    hostname_t hn = NULL;
+    char *p = '\0';
+    int idx = 0;
+
+    assert(hostname != NULL);
+
+    if (!(hn = (hostname_t) malloc(sizeof(*hn))))
+        out_of_memory("hostname create");
+
+    idx = host_prefix_end(hostname);
+
+    if (!(hn->hostname = strdup(hostname))) {
+        free(hn);
+        out_of_memory("hostname create");
+    }
+
+    hn->num = 0;
+    hn->prefix = NULL;
+    hn->suffix = NULL;
+
+    if (idx == strlen(hostname) - 1) {
+        if ((hn->prefix = strdup(hostname)) == NULL) {
+            hostname_destroy(hn);
+            out_of_memory("hostname prefix create");
+        }
+        return hn;
+    }
+
+    hn->suffix = hn->hostname + idx + 1;
+    hn->num = strtoul(hn->suffix, &p, 10);
+
+    if ((*p == '\0') && (hn->num <= MAX_HOST_SUFFIX)) {
+        if (!(hn->prefix = malloc((idx + 2) * sizeof(char)))) {
+            hostname_destroy(hn);
+            out_of_memory("hostname prefix create");
+        }
+        memcpy(hn->prefix, hostname, idx + 1);
+        hn->prefix[idx + 1] = '\0';
+    } else {
+        if (!(hn->prefix = strdup(hostname))) {
+            hostname_destroy(hn);
+            out_of_memory("hostname prefix create");
+        }
+        hn->suffix = NULL;
+    }
+
+    return hn;
+}
+
+/* free a hostname object
+ */
+static void hostname_destroy(hostname_t hn)
+{
+    if (hn == NULL)
+        return;
+    hn->suffix = NULL;
+    if (hn->hostname)
+        free(hn->hostname);
+    if (hn->prefix)
+        free(hn->prefix);
+    free(hn);
+}
+
+/* return true if the hostname has a valid numeric suffix
+ */
+static int hostname_suffix_is_valid(hostname_t hn)
+{
+    return hn->suffix != NULL;
+}
+
+/* return the width (in characters) of the numeric part of the hostname
+ */
+static int hostname_suffix_width(hostname_t hn)
+{
+    assert(hn->suffix != NULL);
+    return (int) strlen(hn->suffix);
+}
+
+
+/* ----[ hostrange_t functions ]---- */
+
+/* allocate a new hostrange object
+ */
+static hostrange_t hostrange_new(void)
+{
+    hostrange_t new = (hostrange_t) malloc(sizeof(*new));
+    if (!new)
+        out_of_memory("hostrange create");
+    return new;
+}
+
+/* Create a hostrange_t containing a single host without a valid suffix
+ * hr->prefix will represent the entire hostname.
+ */
+static hostrange_t hostrange_create_single(const char *prefix)
+{
+    hostrange_t new;
+
+    assert(prefix != NULL);
+
+    if ((new = hostrange_new()) == NULL)
+        goto error1;
+
+    if ((new->prefix = strdup(prefix)) == NULL)
+        goto error2;
+
+    new->singlehost = 1;
+    new->lo = 0L;
+    new->hi = 0L;
+    new->width = 0;
+
+    return new;
+
+  error2:
+    free(new);
+  error1:
+    out_of_memory("hostrange create single");
+}
+
+
+/* Create a hostrange object with a prefix, hi, lo, and format width
+ */
+static hostrange_t
+hostrange_create(char *prefix, unsigned long lo, unsigned long hi, int width)
+{
+    hostrange_t new;
+
+    assert(prefix != NULL);
+
+    if ((new = hostrange_new()) == NULL)
+        goto error1;
+
+    if ((new->prefix = strdup(prefix)) == NULL)
+        goto error2;
+
+    new->lo = lo;
+    new->hi = hi;
+    new->width = width;
+
+    new->singlehost = 0;
+
+    return new;
+
+  error2:
+    free(new);
+  error1:
+    out_of_memory("hostrange create");
+}
+
+
+/* Return the number of hosts stored in the hostrange object
+ */
+static unsigned long hostrange_count(hostrange_t hr)
+{
+    assert(hr != NULL);
+    if (hr->singlehost)
+        return 1;
+    else
+        return hr->hi - hr->lo + 1;
+}
+
+/* Copy a hostrange object
+ */
+static hostrange_t hostrange_copy(hostrange_t hr)
+{
+    assert(hr != NULL);
+
+    if (hr->singlehost)
+        return hostrange_create_single(hr->prefix);
+    else
+        return hostrange_create(hr->prefix, hr->lo, hr->hi,
+                    hr->width);
+}
+
+
+/* free memory allocated by the hostrange object
+ */
+static void hostrange_destroy(hostrange_t hr)
+{
+    if (hr == NULL)
+        return;
+    if (hr->prefix)
+        free(hr->prefix);
+    free(hr);
+}
+
+/* hostrange_delete_host() deletes a specific host from the range.
+ * If the range is split into two, the greater range is returned,
+ * and `hi' of the lesser range is adjusted accordingly. If the
+ * highest or lowest host is deleted from a range, NULL is returned
+ * and the hostrange hr is adjusted properly.
+ */
+static hostrange_t hostrange_delete_host(hostrange_t hr, unsigned long n)
+{
+    hostrange_t new = NULL;
+
+    assert(hr != NULL);
+    assert(n >= hr->lo && n <= hr->hi);
+
+    if (n == hr->lo)
+        hr->lo++;
+    else if (n == hr->hi)
+        hr->hi--;
+    else {
+        if (!(new = hostrange_copy(hr)))
+            out_of_memory("hostrange copy");
+        hr->hi = n - 1;
+        new->lo = n + 1;
+    }
+
+    return new;
+}
+
+/* hostrange_cmp() is used to sort hostrange objects. It will
+ * sort based on the following (in order):
+ *  o result of strcmp on prefixes
+ *  o if widths are compatible, then:
+ *       sort based on lowest suffix in range
+ *    else
+ *       sort based on width                     */
+static int hostrange_cmp(hostrange_t h1, hostrange_t h2)
+{
+    int retval;
+
+    assert(h1 != NULL);
+    assert(h2 != NULL);
+
+    if ((retval = hostrange_prefix_cmp(h1, h2)) == 0)
+        retval = hostrange_width_combine(h1, h2) ?
+            h1->lo - h2->lo : h1->width - h2->width;
+
+    return retval;
+}
+
+
+/* compare the prefixes of two hostrange objects.
+ * returns:
+ *    < 0   if h1 prefix is less than h2 OR h1 == NULL.
+ *
+ *      0   if h1's prefix and h2's prefix match,
+ *          UNLESS, either h1 or h2 (NOT both) do not have a valid suffix.
+ *
+ *    > 0   if h1's prefix is greater than h2's OR h2 == NULL. */
+static int hostrange_prefix_cmp(hostrange_t h1, hostrange_t h2)
+{
+    int retval;
+    if (h1 == NULL)
+        return 1;
+    if (h2 == NULL)
+        return -1;
+
+    retval = strcmp(h1->prefix, h2->prefix);
+    return retval == 0 ? h2->singlehost - h1->singlehost : retval;
+}
+
+/* returns true if h1 and h2 would be included in the same bracketed hostlist.
+ * h1 and h2 will be in the same bracketed list iff:
+ *
+ *  1. h1 and h2 have same prefix
+ *  2. neither h1 nor h2 are singlet hosts (i.e. invalid suffix)
+ *
+ *  (XXX: Should incompatible widths be placed in the same bracketed list?
+ *        There's no good reason not to, except maybe aesthetics)
+ */
+static int hostrange_within_range(hostrange_t h1, hostrange_t h2)
+{
+    if (hostrange_prefix_cmp(h1, h2) == 0)
+        return h1->singlehost || h2->singlehost ? 0 : 1;
+    else
+        return 0;
+}
+
+
+/* compare two hostrange objects to determine if they are width
+ * compatible,  returns:
+ *  1 if widths can safely be combined
+ *  0 if widths cannot be safely combined
+ */
+static int hostrange_width_combine(hostrange_t h0, hostrange_t h1)
+{
+    assert(h0 != NULL);
+    assert(h1 != NULL);
+
+    return _width_equiv(h0->lo, &h0->width, h1->lo, &h1->width);
+}
+
+
+/* Return true if hostrange hr contains no hosts, i.e. hi < lo
+ */
+static int hostrange_empty(hostrange_t hr)
+{
+    assert(hr != NULL);
+    return ((hr->hi < hr->lo) || (hr->hi == (unsigned long) -1));
+}
+
+/* return the string representation of the last host in hostrange hr
+ * and remove that host from the range (i.e. decrement hi if possible)
+ *
+ * Returns NULL if malloc fails OR there are no more hosts left
+ */
+static char *hostrange_pop(hostrange_t hr)
+{
+    size_t size = 0;
+    char *host = NULL;
+
+    assert(hr != NULL);
+
+    if (hr->singlehost) {
+        hr->lo++;    /* effectively set count == 0 */
+        host = strdup(hr->prefix);
+    } else if (hostrange_count(hr) > 0) {
+        size = strlen(hr->prefix) + hr->width + 16;
+        if (!(host = (char *) malloc(size * sizeof(char))))
+            out_of_memory("hostrange pop");
+        snprintf(host, size, "%s%0*lu", hr->prefix,
+             hr->width, hr->hi--);
+    }
+
+    return host;
+}
+
+/* Same as hostrange_pop(), but remove host from start of range */
+static char *hostrange_shift(hostrange_t hr)
+{
+    size_t size = 0;
+    char *host = NULL;
+
+    assert(hr != NULL);
+
+    if (hr->singlehost) {
+        hr->lo++;
+        if (!(host = strdup(hr->prefix)))
+            out_of_memory("hostrange shift");
+    } else if (hostrange_count(hr) > 0) {
+        size = strlen(hr->prefix) + hr->width + 16;
+        if (!(host = (char *) malloc(size * sizeof(char))))
+            out_of_memory("hostrange shift");
+        snprintf(host, size, "%s%0*lu", hr->prefix,
+             hr->width, hr->lo++);
+    }
+
+    return host;
+}
+
+
+/* join two hostrange objects.
+ *
+ * returns:
+ *
+ * -1 if ranges do not overlap (including incompatible zero padding)
+ *  0 if ranges join perfectly
+ * >0 number of hosts that were duplicated in  h1 and h2
+ *
+ * h2 will be coalesced into h1 if rc >= 0
+ *
+ * it is assumed that h1->lo <= h2->lo, i.e. hr1 <= hr2
+ *
+ */
+static int hostrange_join(hostrange_t h1, hostrange_t h2)
+{
+    int duplicated = -1;
+
+    assert(h1 != NULL);
+    assert(h2 != NULL);
+    assert(hostrange_cmp(h1, h2) <= 0);
+
+    if (hostrange_prefix_cmp(h1, h2) == 0 &&
+        hostrange_width_combine(h1, h2)) {
+
+        if (h1->singlehost && h2->singlehost) {    /* matching singlets  */
+            duplicated = 1;
+        } else if (h1->hi == h2->lo - 1) {    /* perfect join       */
+            h1->hi = h2->hi;
+            duplicated = 0;
+        } else if (h1->hi >= h2->lo) {    /* some duplication   */
+            if (h1->hi < h2->hi) {
+                duplicated = h1->hi - h2->lo + 1;
+                h1->hi = h2->hi;
+            } else
+                duplicated = hostrange_count(h2);
+        }
+    }
+
+    return duplicated;
+}
+
+/* hostrange intersect returns the intersection (common hosts)
+ * of hostrange objects h1 and h2. If there is no intersection,
+ * NULL is returned.
+ *
+ * It is assumed that h1 <= h2 (i.e. h1->lo <= h2->lo)
+ */
+static hostrange_t hostrange_intersect(hostrange_t h1, hostrange_t h2)
+{
+    hostrange_t new = NULL;
+
+    assert(h1 != NULL);
+    assert(h2 != NULL);
+
+    if (h1->singlehost || h2->singlehost)
+        return NULL;
+
+    assert(hostrange_cmp(h1, h2) <= 0);
+
+    if ((hostrange_prefix_cmp(h1, h2) == 0)
+        && (h1->hi > h2->lo)
+        && (hostrange_width_combine(h1, h2))) {
+
+        if (!(new = hostrange_copy(h1)))
+            return NULL;
+        new->lo = h2->lo;
+        new->hi = h2->hi < h1->hi ? h2->hi : h1->hi;
+    }
+
+    return new;
+}
+
+/* return 1 if hostname hn is within the hostrange hr
+ *        0 if not.
+ */
+static int hostrange_hn_within(hostrange_t hr, hostname_t hn)
+{
+    int retval = 0;
+
+    if (hr->singlehost && (strcmp(hn->hostname, hr->prefix) == 0))
+        return 1;
+
+    if (strcmp(hr->prefix, hn->prefix) == 0) {
+        if (!hostname_suffix_is_valid(hn)) {
+            if (hr->singlehost)
+                retval = 1;
+        } else if (hn->num <= hr->hi && hn->num >= hr->lo) {
+            int width = hostname_suffix_width(hn);
+            int num = hn->num;
+            retval = _width_equiv(hr->lo, &hr->width, num, &width);
+        }
+    }
+    return retval;
+}
+
+
+/* copy a string representation of the hostrange hr into buffer buf,
+ * writing at most n chars including NUL termination
+ */
+static size_t
+hostrange_to_string(hostrange_t hr, size_t n, char *buf, char *separator)
+{
+    unsigned long i;
+    int truncated = 0;
+    int len = 0;
+    char sep = separator == NULL ? ',' : separator[0];
+
+    if (n == 0)
+        return 0;
+
+    if (hr->singlehost)
+        return snprintf(buf, n, "%s", hr->prefix);
+
+    for (i = hr->lo; i <= hr->hi; i++) {
+        size_t m = (n - len) <= n ? n - len : 0; /* check for < 0 */
+        int ret = snprintf(buf + len, m, "%s%0*lu",
+                   hr->prefix, hr->width, i);
+        if (ret < 0 || ret >= m) {
+            len = n;
+            truncated = 1;
+            break;
+        }
+        len+=ret;
+        buf[len++] = sep;
+    }
+
+    if (truncated) {
+        buf[n-1] = '\0';
+        return -1;
+    } else {
+        /* back up over final separator */
+        buf[--len] = '\0';
+        return len;
+    }
+}
+
+/* Place the string representation of the numeric part of hostrange into buf
+ * writing at most n chars including NUL termination.
+ */
+static size_t hostrange_numstr(hostrange_t hr, size_t n, char *buf)
+{
+    int len = 0;
+
+    assert(buf != NULL);
+
+    if (hr->singlehost || n == 0)
+        return 0;
+
+    len = snprintf(buf, n, "%0*lu", hr->width, hr->lo);
+
+    if ((len >= 0) && (len < n) && (hr->lo < hr->hi)) {
+        int len2 = snprintf(buf+len, n-len, "-%0*lu", hr->width, hr->hi);
+        if (len2 < 0)
+            len = -1;
+        else
+            len += len2;
+    }
+
+    return len;
+}
+
+
+/* ----[ hostlist functions ]---- */
+
+/* Create a new hostlist object.
+ * Returns an empty hostlist, or NULL if memory allocation fails.
+ */
+static hostlist_t hostlist_new(void)
+{
+    int i;
+    hostlist_t new = (hostlist_t) malloc(sizeof(*new));
+    if (!new)
+        goto fail1;
+
+    assert(new->magic = HOSTLIST_MAGIC);
+    mutex_init(&new->mutex);
+
+    new->hr = (hostrange_t *) malloc(HOSTLIST_CHUNK * sizeof(hostrange_t));
+    if (!new->hr)
+        goto fail2;
+
+    /* set entries in hostrange array to NULL */
+    for (i = 0; i < HOSTLIST_CHUNK; i++)
+        new->hr[i] = NULL;
+
+    new->size = HOSTLIST_CHUNK;
+    new->nranges = 0;
+    new->nhosts = 0;
+    new->ilist = NULL;
+    return new;
+
+  fail2:
+    free(new);
+  fail1:
+    out_of_memory("hostlist_create");
+}
+
+
+/* Resize the internal array used to store the list of hostrange objects.
+ *
+ * returns 1 for a successful resize,
+ *         0 if call to _realloc fails
+ *
+ * It is assumed that the caller has the hostlist hl locked
+ */
+static int hostlist_resize(hostlist_t hl, size_t newsize)
+{
+    int i;
+    size_t oldsize;
+    assert(hl != NULL);
+    assert(hl->magic == HOSTLIST_MAGIC);
+    oldsize = hl->size;
+    hl->size = newsize;
+    hl->hr = realloc((void *) hl->hr, hl->size*sizeof(hostrange_t));
+    if (!(hl->hr))
+        return 0;
+
+    for (i = oldsize; i < newsize; i++)
+        hl->hr[i] = NULL;
+
+    return 1;
+}
+
+/* Resize hostlist by one HOSTLIST_CHUNK
+ * Assumes that hostlist hl is locked by caller
+ */
+static int hostlist_expand(hostlist_t hl)
+{
+    if (!hostlist_resize(hl, hl->size + HOSTLIST_CHUNK))
+        return 0;
+    else
+        return 1;
+}
+
+/* Push a hostrange object onto hostlist hl
+ * Returns the number of hosts successfully pushed onto hl
+ * or -1 if there was an error allocating memory
+ */
+static int hostlist_push_range(hostlist_t hl, hostrange_t hr)
+{
+    hostrange_t tail;
+    int retval;
+
+    assert(hr != NULL);
+    LOCK_HOSTLIST(hl);
+
+    tail = (hl->nranges > 0) ? hl->hr[hl->nranges-1] : hl->hr[0];
+
+    if (hl->size == hl->nranges && !hostlist_expand(hl))
+        goto error;
+
+    if (hl->nranges > 0
+        && hostrange_prefix_cmp(tail, hr) == 0
+        && tail->hi == hr->lo - 1
+        && hostrange_width_combine(tail, hr)) {
+        tail->hi = hr->hi;
+    } else {
+        if ((hl->hr[hl->nranges++] = hostrange_copy(hr)) == NULL)
+            goto error;
+    }
+
+    retval = hl->nhosts += hostrange_count(hr);
+
+    UNLOCK_HOSTLIST(hl);
+
+    return retval;
+
+  error:
+    UNLOCK_HOSTLIST(hl);
+    return -1;
+}
+
+
+
+/* Same as hostlist_push_range() above, but prefix, lo, hi, and width
+ * are passed as args
+ */
+static int
+hostlist_push_hr(hostlist_t hl, char *prefix, unsigned long lo,
+         unsigned long hi, int width)
+{
+    hostrange_t hr = hostrange_create(prefix, lo, hi, width);
+    int retval = hostlist_push_range(hl, hr);
+    hostrange_destroy(hr);
+    return retval;
+}
+
+/* Insert a range object hr into position n of the hostlist hl
+ * Assumes that hl->mutex is already held by calling process
+ */
+static int hostlist_insert_range(hostlist_t hl, hostrange_t hr, int n)
+{
+    int i;
+    hostrange_t tmp;
+    hostlist_iterator_t hli;
+
+    assert(hl != NULL);
+    assert(hl->magic == HOSTLIST_MAGIC);
+    assert(hr != NULL);
+
+    if (n > hl->nranges)
+        return 0;
+
+    if (hl->size == hl->nranges && !hostlist_expand(hl))
+        return 0;
+
+    /* copy new hostrange into slot "n" in array */
+    tmp = hl->hr[n];
+    hl->hr[n] = hostrange_copy(hr);
+
+    /* push remaining hostrange entries up */
+    for (i = n + 1; i < hl->nranges + 1; i++) {
+        hostrange_t last = hl->hr[i];
+        hl->hr[i] = tmp;
+        tmp = last;
+    }
+    hl->nranges++;
+
+    /* adjust hostlist iterators if needed */
+    for (hli = hl->ilist; hli; hli = hli->next) {
+        if (hli->idx >= n)
+            hli->hr = hli->hl->hr[++hli->idx];
+    }
+
+    return 1;
+}
+
+/* Delete the range at position n in the range array
+ * Assumes the hostlist lock is already held.
+ */
+static void hostlist_delete_range(hostlist_t hl, int n)
+{
+    int i;
+    hostrange_t old;
+
+    assert(hl != NULL);
+    assert(hl->magic == HOSTLIST_MAGIC);
+    assert(n < hl->nranges && n >= 0);
+
+    old = hl->hr[n];
+    for (i = n; i < hl->nranges - 1; i++)
+        hl->hr[i] = hl->hr[i + 1];
+    hl->nranges--;
+    hl->hr[hl->nranges] = NULL;
+    hostlist_shift_iterators(hl, n, 0, 1);
+
+    /* XXX caller responsible for adjusting nhosts */
+    /* hl->nhosts -= hostrange_count(old) */
+
+    hostrange_destroy(old);
+}
+
+#if WANT_RECKLESS_HOSTRANGE_EXPANSION
+
+/* The reckless hostrange expansion function.
+ * See comment in hostlist.h:hostlist_create() for more info on
+ * the different choices for hostlist notation.
+ */
+hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op)
+{
+    char *str, *orig;
+    char *tok, *cur;
+    int high, low, fmt = 0;
+    char prefix[256] = "";
+    int pos = 0;
+    int error = 0;
+    char range_op = r_op[0];/* XXX support > 1 char range ops in future? */
+
+    hostlist_t new = hostlist_new();
+
+    orig = str = strdup(hostlist);
+
+    /* return an empty list if an empty string was passed in */
+    if (str == NULL || strlen(str) == 0)
+        goto done;
+
+    /* Use hostlist_create_bracketed if we see "[" */
+    if (strchr(str, '[') != NULL)
+        return _hostlist_create_bracketed(hostlist, sep, r_op);
+
+    while ((tok = _next_tok(sep, &str)) != NULL) {
+
+        /* save the current string for error messages */
+        cur = tok;
+
+        high = low = 0;
+
+        /* find end of alpha part
+         *   do this by finding last occurence of range_op in str */
+        pos = strlen(tok) - 1;
+        if (strstr(tok, r_op) != '\0') {
+            while (pos >= 0 && (char) tok[pos] != range_op)
+                pos--;
+        }
+
+        /* now back up past any digits */
+        while (pos >= 0 && isdigit((char) tok[--pos])) {;}
+
+        /* Check for valid x-y range (x must be a digit)
+         *   Reset pos if the range is not valid         */
+        if (!isdigit((char) tok[++pos]))
+            pos = strlen(tok) - 1;
+
+        /* create prefix string
+         * if prefix will be zero length, but prefix already exists
+         * use the previous prefix and fmt
+         */
+        if ((pos > 0) || (prefix[0] == '\0')) {
+            memcpy(prefix, tok, (size_t) pos * sizeof(char));
+            prefix[pos] = '\0';
+
+            /* push pointer past prefix */
+            tok += pos;
+
+            /* count number of digits for ouput fmt */
+            for (fmt = 0; isdigit(tok[fmt]); ++fmt) {;}
+
+            if (fmt == 0)
+                error = 1;
+
+        } else
+            tok += pos;
+
+        /* get lower bound */
+        low = strtoul(tok, (char **) &tok, 10);
+
+        if (*tok == range_op) {    /* now get range upper bound */
+            /* push pointer past range op */
+            ++tok;
+
+            /* find length of alpha part */
+            for (pos = 0; tok[pos] && !isdigit(tok[pos]); ++pos) {;}
+
+            /* alpha part must match prefix or error
+             * this could mean we've got something like "rtr1-a2"
+             * so just record an error
+             */
+            if (pos > 0) {
+                if (pos != strlen(prefix) ||
+                    strncmp(prefix, tok, pos) != 0)
+                    error = 1;
+            }
+
+            if (*tok != '\0')
+                tok += pos;
+
+            /* make sure we have digits to the end */
+            for (pos = 0; tok[pos] && isdigit((char) tok[pos]); ++pos) {;}
+
+            if (pos > 0) {    /* we have digits to process */
+                high = strtoul(tok, (char **) &tok, 10);
+            } else {    /* bad boy, no digits */
+                error = 1;
+            }
+
+            if ((low > high) || (high - low > MAX_RANGE))
+                error = 1;
+
+        } else {    /* single value */
+            high = 0;    /* special case, ugh. */
+        }
+
+        /* error if:
+         * 1. we are not at end of string
+         * 2. upper bound equals lower bound
+         */
+        if (*tok != '\0' || high == low)
+            error = 1;
+
+        if (error) {    /* assume this is not a range on any error */
+            hostlist_push_host(new, cur);
+        } else {
+            if (high < low)
+                high = low;
+            hostlist_push_hr(new, prefix, low, high, fmt);
+        }
+
+        error = 0;
+    }
+
+  done:
+    free(orig);
+
+    return new;
+}
+
+#else                /* !WANT_RECKLESS_HOSTRANGE_EXPANSION */
+
+hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op)
+{
+    return _hostlist_create_bracketed(hostlist, sep, r_op);
+}
+
+#endif                /* WANT_RECKLESS_HOSTRANGE_EXPANSION */
+
+struct _range {
+    unsigned long lo, hi;
+    int width;
+};
+
+/* Grab a single range from str
+ * returns 1 if str contained a valid number or range,
+ *         0 if conversion of str to a range failed.
+ */
+static int _parse_single_range(const char *str, struct _range *range)
+{
+    char *p, *q;
+    char *orig = strdup(str);
+    if (!orig)
+        seterrno_ret(ENOMEM, 0);
+
+    if ((p = strchr(str, '-'))) {
+        *p++ = '\0';
+        if (*p == '-')     /* do NOT allow negative numbers */
+            goto error;
+    }
+    range->lo = strtoul(str, &q, 10);
+    if (q == str)
+        goto error;
+
+    range->hi = (p && *p) ? strtoul(p, &q, 10) : range->lo;
+
+    if (q == p || *q != '\0')
+        goto error;
+
+    if (range->lo > range->hi)
+        goto error;
+
+    if (range->hi - range->lo + 1 > MAX_RANGE ) {
+        _error(__FILE__, __LINE__, "Too many hosts in range `%s'", orig);
+        free(orig);
+        seterrno_ret(ERANGE, 0);
+    }
+
+    free(orig);
+    range->width = strlen(str);
+    return 1;
+
+  error:
+    _error(__FILE__, __LINE__, "Invalid range: `%s'", orig);
+    free(orig);
+    seterrno_ret(EINVAL, 0);
+}
+
+
+/*
+ * Convert 'str' containing comma separated digits and ranges into an array
+ *  of struct _range types (max 'len' elements).
+ *
+ * Return number of ranges created, or -1 on error.
+ */
+static int _parse_range_list(char *str, struct _range *ranges, int len)
+{
+    char *p;
+    int count = 0;
+
+    while (str) {
+        if (count == len)
+            return -1;
+        if ((p = strchr(str, ',')))
+            *p++ = '\0';
+        if (!_parse_single_range(str, &ranges[count++]))
+            return -1;
+        str = p;
+    }
+    return count;
+}
+
+static void
+_push_range_list(hostlist_t hl, char *pfx, struct _range *rng,
+             int n)
+{
+    int i;
+    for (i = 0; i < n; i++) {
+        hostlist_push_hr(hl, pfx, rng->lo, rng->hi, rng->width);
+        rng++;
+    }
+}
+
+static void
+_push_range_list_with_suffix(hostlist_t hl, char *pfx, char *sfx,
+                             struct _range *rng, int n)
+{
+    int i;
+    unsigned long j;
+    for (i = 0; i < n; i++) {
+        for (j = rng->lo; j <= rng->hi; j++) {
+            char host[4096];
+            hostrange_t hr;
+            snprintf (host, 4096, "%s%0*lu%s", pfx, rng->width, j, sfx);
+            hr = hostrange_create_single (host);
+            hostlist_push_range (hl, hr);
+            /*
+             * hr is copied in hostlist_push_range. Need to free here.
+             */
+            hostrange_destroy (hr);
+        }
+        rng++;
+    }
+}
+
+/*
+ * Create a hostlist from a string with brackets '[' ']' to aid
+ * detection of ranges and compressed lists
+ */
+static hostlist_t
+_hostlist_create_bracketed(const char *hostlist, char *sep, char *r_op)
+{
+    hostlist_t new = hostlist_new();
+    struct _range ranges[MAX_RANGES];
+    int nr, err;
+    char *p, *tok, *str, *orig;
+    char cur_tok[1024];
+
+    if (hostlist == NULL)
+        return new;
+
+    if (!(orig = str = strdup(hostlist))) {
+        hostlist_destroy(new);
+        return NULL;
+    }
+
+    while ((tok = _next_tok(sep, &str)) != NULL) {
+        strncpy(cur_tok, tok, 1024);
+
+        if ((p = strchr(tok, '[')) != NULL) {
+            char *q, *prefix = tok;
+            *p++ = '\0';
+
+            if ((q = strchr(p, ']'))) {
+                *q = '\0';
+                nr = _parse_range_list(p, ranges, MAX_RANGES);
+                if (nr < 0)
+                    goto error;
+
+                if (*(++q) != '\0')
+                    _push_range_list_with_suffix (new, prefix, q, ranges, nr);
+                else
+                    _push_range_list(new, prefix, ranges, nr);
+
+
+            } else
+                hostlist_push_host(new, cur_tok);
+
+        } else
+            hostlist_push_host(new, cur_tok);
+    }
+
+    free(orig);
+    return new;
+
+  error:
+    err = errno;
+    hostlist_destroy(new);
+    free(orig);
+    seterrno_ret(err, NULL);
+}
+
+
+
+hostlist_t hostlist_create(const char *str)
+{
+    return _hostlist_create(str, "\t, ", "-");
+}
+
+
+hostlist_t hostlist_copy(const hostlist_t hl)
+{
+    int i;
+    hostlist_t new;
+
+    if (hl == NULL)
+        return NULL;
+
+    LOCK_HOSTLIST(hl);
+    if (!(new = hostlist_new()))
+        goto done;
+
+    new->nranges = hl->nranges;
+    new->nhosts = hl->nhosts;
+    if (new->nranges > new->size)
+        hostlist_resize(new, new->nranges);
+
+    for (i = 0; i < hl->nranges; i++)
+        new->hr[i] = hostrange_copy(hl->hr[i]);
+
+  done:
+    UNLOCK_HOSTLIST(hl);
+    return new;
+}
+
+
+void hostlist_destroy(hostlist_t hl)
+{
+    int i;
+    if (hl == NULL)
+        return;
+    LOCK_HOSTLIST(hl);
+    while (hl->ilist) {
+        mutex_unlock(&hl->mutex);
+        hostlist_iterator_destroy(hl->ilist);
+        mutex_lock(&hl->mutex);
+    }
+    for (i = 0; i < hl->nranges; i++)
+        hostrange_destroy(hl->hr[i]);
+    free(hl->hr);
+    assert(hl->magic = 0x1);
+    UNLOCK_HOSTLIST(hl);
+    mutex_destroy(&hl->mutex);
+    free(hl);
+}
+
+
+int hostlist_push(hostlist_t hl, const char *hosts)
+{
+    hostlist_t new;
+    int retval;
+    if (hosts == NULL)
+        return 0;
+    new = hostlist_create(hosts);
+    if (!new)
+        return 0;
+    mutex_lock(&new->mutex);
+    retval = new->nhosts;
+    mutex_unlock(&new->mutex);
+    hostlist_push_list(hl, new);
+    hostlist_destroy(new);
+    return retval;
+}
+
+int hostlist_push_host(hostlist_t hl, const char *str)
+{
+    hostrange_t hr;
+    hostname_t hn;
+
+    if (str == NULL)
+        return 0;
+
+    hn = hostname_create(str);
+
+    if (hostname_suffix_is_valid(hn)) {
+        hr = hostrange_create(hn->prefix, hn->num, hn->num,
+                      hostname_suffix_width(hn));
+    } else
+        hr = hostrange_create_single(str);
+
+    hostlist_push_range(hl, hr);
+
+    hostrange_destroy(hr);
+    hostname_destroy(hn);
+
+    return 1;
+}
+
+int hostlist_push_list(hostlist_t h1, hostlist_t h2)
+{
+    int i, n = 0;
+
+    if (h2 == NULL)
+        return 0;
+
+    LOCK_HOSTLIST(h2);
+
+    for (i = 0; i < h2->nranges; i++)
+        n += hostlist_push_range(h1, h2->hr[i]);
+
+    UNLOCK_HOSTLIST(h2);
+
+    return n;
+}
+
+
+char *hostlist_pop(hostlist_t hl)
+{
+    char *host = NULL;
+
+    LOCK_HOSTLIST(hl);
+    if (hl->nhosts > 0) {
+        hostrange_t hr = hl->hr[hl->nranges - 1];
+        host = hostrange_pop(hr);
+        hl->nhosts--;
+        if (hostrange_empty(hr)) {
+            hostrange_destroy(hl->hr[--hl->nranges]);
+            hl->hr[hl->nranges] = NULL;
+        }
+    }
+    UNLOCK_HOSTLIST(hl);
+    return host;
+}
+
+/* find all iterators affected by a shift (or deletion) at
+ * hl->hr[idx], depth, with the deletion of n ranges */
+static void
+hostlist_shift_iterators(hostlist_t hl, int idx, int depth, int n)
+{
+    hostlist_iterator_t i;
+    for (i = hl->ilist; i; i = i->next) {
+        if (n == 0) {
+            if (i->idx == idx && i->depth >= depth)
+                i->depth = i->depth > -1 ? i->depth - 1 : -1;
+        } else {
+            if (i->idx >= idx) {
+                if ((i->idx -= n) >= 0)
+                    i->hr = i->hl->hr[i->idx];
+                else
+                    hostlist_iterator_reset(i);
+            }
+        }
+    }
+}
+
+char *hostlist_shift(hostlist_t hl)
+{
+    char *host = NULL;
+
+    LOCK_HOSTLIST(hl);
+
+    if (hl->nhosts > 0) {
+        hostrange_t hr = hl->hr[0];
+
+        host = hostrange_shift(hr);
+        hl->nhosts--;
+
+        if (hostrange_empty(hr)) {
+            hostlist_delete_range(hl, 0);
+            /* hl->nranges--; */
+        } else
+            hostlist_shift_iterators(hl, 0, 0, 0);
+    }
+
+    UNLOCK_HOSTLIST(hl);
+
+    return host;
+}
+
+
+char *hostlist_pop_range(hostlist_t hl)
+{
+    int i;
+    char buf[MAXHOSTRANGELEN + 1];
+    hostlist_t hltmp;
+    hostrange_t tail;
+
+    LOCK_HOSTLIST(hl);
+    if (hl->nranges < 1 || !(hltmp = hostlist_new())) {
+        UNLOCK_HOSTLIST(hl);
+        return NULL;
+    }
+
+    i = hl->nranges - 2;
+    tail = hl->hr[hl->nranges - 1];
+    while (i >= 0 && hostrange_within_range(tail, hl->hr[i]))
+        i--;
+
+    for (i++; i < hl->nranges; i++) {
+        hostlist_push_range(hltmp, hl->hr[i]);
+        hostrange_destroy(hl->hr[i]);
+        hl->hr[i] = NULL;
+    }
+    hl->nhosts -= hltmp->nhosts;
+    hl->nranges -= hltmp->nranges;
+
+    UNLOCK_HOSTLIST(hl);
+    hostlist_ranged_string(hltmp, MAXHOSTRANGELEN, buf);
+    hostlist_destroy(hltmp);
+    return strdup(buf);
+}
+
+
+char *hostlist_shift_range(hostlist_t hl)
+{
+    int i;
+    char buf[1024];
+    hostlist_t hltmp = hostlist_new();
+    if (!hltmp)
+        return NULL;
+
+    LOCK_HOSTLIST(hl);
+
+    if (hl->nranges == 0) {
+        hostlist_destroy(hltmp);
+        UNLOCK_HOSTLIST(hl);
+        return NULL;
+    }
+
+    i = 0;
+    do {
+        hostlist_push_range(hltmp, hl->hr[i]);
+        hostrange_destroy(hl->hr[i]);
+    } while ( (++i < hl->nranges)
+            && hostrange_within_range(hltmp->hr[0], hl->hr[i]) );
+
+    hostlist_shift_iterators(hl, i, 0, hltmp->nranges);
+
+    /* shift rest of ranges back in hl */
+    for (; i < hl->nranges; i++) {
+        hl->hr[i - hltmp->nranges] = hl->hr[i];
+        hl->hr[i] = NULL;
+    }
+    hl->nhosts -= hltmp->nhosts;
+    hl->nranges -= hltmp->nranges;
+
+    UNLOCK_HOSTLIST(hl);
+
+    hostlist_ranged_string(hltmp, 1024, buf);
+    hostlist_destroy(hltmp);
+
+    return strdup(buf);
+}
+
+/* XXX: Note: efficiency improvements needed */
+int hostlist_delete(hostlist_t hl, const char *hosts)
+{
+    int n = 0;
+    char *hostname = NULL;
+    hostlist_t hltmp;
+
+    if (!(hltmp = hostlist_create(hosts)))
+        seterrno_ret(EINVAL, 0);
+
+    while ((hostname = hostlist_pop(hltmp)) != NULL) {
+        n += hostlist_delete_host(hl, hostname);
+        free(hostname);
+    }
+    hostlist_destroy(hltmp);
+
+    return n;
+}
+
+
+/* XXX watch out! poor implementation follows! (fix it at some point) */
+int hostlist_delete_host(hostlist_t hl, const char *hostname)
+{
+    int n = hostlist_find(hl, hostname);
+    if (n >= 0)
+        hostlist_delete_nth(hl, n);
+    return n >= 0 ? 1 : 0;
+}
+
+
+static char *
+_hostrange_string(hostrange_t hr, int depth)
+{
+    char buf[MAXHOSTNAMELEN + 16];
+    int  len = snprintf(buf, MAXHOSTNAMELEN + 15, "%s", hr->prefix);
+
+    if (!hr->singlehost)
+        snprintf(buf+len, MAXHOSTNAMELEN+15 - len, "%0*lu",
+                 hr->width, hr->lo + depth);
+    return strdup(buf);
+}
+
+char * hostlist_nth(hostlist_t hl, int n)
+{
+    char *host = NULL;
+    int   i, count;
+
+    LOCK_HOSTLIST(hl);
+    count = 0;
+    for (i = 0; i < hl->nranges; i++) {
+        int num_in_range = hostrange_count(hl->hr[i]);
+
+        if (n <= (num_in_range - 1 + count)) {
+            host = _hostrange_string(hl->hr[i], n - count);
+            break;
+        } else
+            count += num_in_range;
+    }
+
+    UNLOCK_HOSTLIST(hl);
+
+    return host;
+}
+
+
+int hostlist_delete_nth(hostlist_t hl, int n)
+{
+    int i, count;
+
+    LOCK_HOSTLIST(hl);
+    assert(n >= 0 && n <= hl->nhosts);
+
+    count = 0;
+
+    for (i = 0; i < hl->nranges; i++) {
+        int num_in_range = hostrange_count(hl->hr[i]);
+        hostrange_t hr = hl->hr[i];
+
+        if (n <= (num_in_range - 1 + count)) {
+            unsigned long num = hr->lo + n - count;
+            hostrange_t new;
+
+            if (hr->singlehost) { /* this wasn't a range */
+                hostlist_delete_range(hl, i);
+            } else if ((new = hostrange_delete_host(hr, num))) {
+                hostlist_insert_range(hl, new, i + 1);
+                hostrange_destroy(new);
+            } else if (hostrange_empty(hr))
+                hostlist_delete_range(hl, i);
+
+            goto done;
+        } else
+            count += num_in_range;
+
+    }
+
+  done:
+    UNLOCK_HOSTLIST(hl);
+    hl->nhosts--;
+    return 1;
+}
+
+int hostlist_count(hostlist_t hl)
+{
+    int retval;
+    LOCK_HOSTLIST(hl);
+    retval = hl->nhosts;
+    UNLOCK_HOSTLIST(hl);
+    return retval;
+}
+
+int hostlist_find(hostlist_t hl, const char *hostname)
+{
+    int i, count, ret = -1;
+    hostname_t hn;
+
+    if (!hostname)
+        return -1;
+
+    hn = hostname_create(hostname);
+
+    LOCK_HOSTLIST(hl);
+
+    for (i = 0, count = 0; i < hl->nranges; i++) {
+        if (hostrange_hn_within(hl->hr[i], hn)) {
+            if (hostname_suffix_is_valid(hn) && !hl->hr[i]->singlehost)
+                ret = count + hn->num - hl->hr[i]->lo;
+            else
+                ret = count;
+            goto done;
+        } else
+            count += hostrange_count(hl->hr[i]);
+    }
+
+  done:
+    UNLOCK_HOSTLIST(hl);
+    hostname_destroy(hn);
+    return ret;
+}
+
+/* hostrange compare with void * arguments to allow use with
+ * libc qsort()
+ */
+int _cmp(const void *hr1, const void *hr2)
+{
+    hostrange_t *h1 = (hostrange_t *) hr1;
+    hostrange_t *h2 = (hostrange_t *) hr2;
+    return hostrange_cmp((hostrange_t) * h1, (hostrange_t) * h2);
+}
+
+
+void hostlist_sort(hostlist_t hl)
+{
+    hostlist_iterator_t i;
+    LOCK_HOSTLIST(hl);
+
+    if (hl->nranges <= 1) {
+        UNLOCK_HOSTLIST(hl);
+        return;
+    }
+
+    qsort(hl->hr, hl->nranges, sizeof(hostrange_t), &_cmp);
+
+    /* reset all iterators */
+    for (i = hl->ilist; i; i = i->next)
+        hostlist_iterator_reset(i);
+
+    UNLOCK_HOSTLIST(hl);
+
+    hostlist_coalesce(hl);
+
+}
+
+
+/* search through hostlist for ranges that can be collapsed
+ * does =not= delete any hosts
+ */
+static void hostlist_collapse(hostlist_t hl)
+{
+    int i;
+
+    LOCK_HOSTLIST(hl);
+    for (i = hl->nranges - 1; i > 0; i--) {
+        hostrange_t hprev = hl->hr[i - 1];
+        hostrange_t hnext = hl->hr[i];
+
+        if (hostrange_prefix_cmp(hprev, hnext) == 0 &&
+            hprev->hi == hnext->lo - 1 &&
+            hostrange_width_combine(hprev, hnext)) {
+            hprev->hi = hnext->hi;
+            hostlist_delete_range(hl, i);
+        }
+    }
+    UNLOCK_HOSTLIST(hl);
+}
+
+/* search through hostlist (hl) for intersecting ranges
+ * split up duplicates and coalesce ranges where possible
+ */
+static void hostlist_coalesce(hostlist_t hl)
+{
+    int i, j;
+    hostrange_t new;
+
+    LOCK_HOSTLIST(hl);
+
+    for (i = hl->nranges - 1; i > 0; i--) {
+
+        new = hostrange_intersect(hl->hr[i - 1], hl->hr[i]);
+
+        if (new) {
+            hostrange_t hprev = hl->hr[i - 1];
+            hostrange_t hnext = hl->hr[i];
+            j = i;
+
+            if (new->hi < hprev->hi)
+                hnext->hi = hprev->hi;
+
+            hprev->hi = new->lo;
+            hnext->lo = new->hi;
+
+            if (hostrange_empty(hprev))
+                hostlist_delete_range(hl, i);
+
+            while (new->lo <= new->hi) {
+                hostrange_t hr = hostrange_create( new->prefix,
+                                                   new->lo, new->lo,
+                                                   new->width );
+
+                if (new->lo > hprev->hi)
+                    hostlist_insert_range(hl, hr, j++);
+
+                if (new->lo < hnext->lo)
+                    hostlist_insert_range(hl, hr, j++);
+
+                hostrange_destroy(hr);
+
+                new->lo++;
+            }
+            i = hl->nranges;
+            hostrange_destroy(new);
+        }
+    }
+    UNLOCK_HOSTLIST(hl);
+
+    hostlist_collapse(hl);
+
+}
+
+/* attempt to join ranges at loc and loc-1 in a hostlist  */
+/* delete duplicates, return the number of hosts deleted  */
+/* assumes that the hostlist hl has been locked by caller */
+/* returns -1 if no range join occurred */
+static int _attempt_range_join(hostlist_t hl, int loc)
+{
+    int ndup;
+    assert(hl != NULL);
+    assert(hl->magic == HOSTLIST_MAGIC);
+    assert(loc > 0);
+    assert(loc < hl->nranges);
+    ndup = hostrange_join(hl->hr[loc - 1], hl->hr[loc]);
+    if (ndup >= 0) {
+        hostlist_delete_range(hl, loc);
+        hl->nhosts -= ndup;
+    }
+    return ndup;
+}
+
+void hostlist_uniq(hostlist_t hl)
+{
+    int i = 1;
+    hostlist_iterator_t hli;
+    LOCK_HOSTLIST(hl);
+    if (hl->nranges <= 1) {
+        UNLOCK_HOSTLIST(hl);
+        return;
+    }
+    qsort(hl->hr, hl->nranges, sizeof(hostrange_t), &_cmp);
+
+    while (i < hl->nranges) {
+        if (_attempt_range_join(hl, i) < 0) /* No range join occurred */
+            i++;
+    }
+
+    /* reset all iterators */
+    for (hli = hl->ilist; hli; hli = hli->next)
+        hostlist_iterator_reset(hli);
+
+    UNLOCK_HOSTLIST(hl);
+}
+
+
+size_t hostlist_deranged_string(hostlist_t hl, size_t n, char *buf)
+{
+    int i;
+    int len = 0;
+    int truncated = 0;
+
+    LOCK_HOSTLIST(hl);
+    for (i = 0; i < hl->nranges; i++) {
+        size_t m = (n - len) <= n ? n - len : 0;
+        int ret = hostrange_to_string(hl->hr[i], m, buf + len, ",");
+        if (ret < 0 || ret > m) {
+            len = n;
+            truncated = 1;
+            break;
+        }
+        len+=ret;
+        buf[len++] = ',';
+    }
+    UNLOCK_HOSTLIST(hl);
+
+    buf[len > 0 ? --len : 0] = '\0';
+    if (len == n)
+        truncated = 1;
+
+    return truncated ? -1 : len;
+}
+
+/* return true if a bracket is needed for the range at i in hostlist hl */
+static int _is_bracket_needed(hostlist_t hl, int i)
+{
+    hostrange_t h1 = hl->hr[i];
+    hostrange_t h2 = i < hl->nranges - 1 ? hl->hr[i + 1] : NULL;
+    return hostrange_count(h1) > 1 || hostrange_within_range(h1, h2);
+}
+
+/* write the next bracketed hostlist, i.e. prefix[n-m,k,...]
+ * into buf, writing at most n chars including the terminating '\0'
+ *
+ * leaves start pointing to one past last range object in bracketed list,
+ * and returns the number of bytes written into buf.
+ *
+ * Assumes hostlist is locked.
+ */
+static int
+_get_bracketed_list(hostlist_t hl, int *start, const size_t n, char *buf)
+{
+    hostrange_t *hr = hl->hr;
+    int i = *start;
+    int m, len = 0;
+    int bracket_needed = _is_bracket_needed(hl, i);
+
+    len = snprintf(buf, n, "%s", hr[i]->prefix);
+
+    if ((len < 0) || (len > n))
+        return n; /* truncated, buffer filled */
+
+    if (bracket_needed && len < n && len >= 0)
+        buf[len++] = '[';
+
+    do {
+        m = (n - len) <= n ? n - len : 0;
+        len += hostrange_numstr(hr[i], m, buf + len);
+        if (len >= n)
+            break;
+        if (bracket_needed) /* Only need commas inside brackets */
+            buf[len++] = ',';
+    } while (++i < hl->nranges && hostrange_within_range(hr[i], hr[i-1]));
+
+    if (bracket_needed && len < n && len > 0) {
+
+        /* Add trailing bracket (change trailing "," from above to "]" */
+        buf[len - 1] = ']';
+
+        /* NUL terminate for safety, but do not add terminator to len */
+        buf[len]   = '\0';
+
+    } else if (len >= n) {
+        if (n > 0)
+            buf[n-1] = '\0';
+
+    } else {
+        /* If len is > 0, NUL terminate (but do not add to len) */
+        buf[len > 0 ? len : 0] = '\0';
+    }
+
+    *start = i;
+    return len;
+}
+
+size_t hostlist_ranged_string(hostlist_t hl, size_t n, char *buf)
+{
+    int i = 0;
+    int len = 0;
+    int truncated = 0;
+
+    LOCK_HOSTLIST(hl);
+    while (i < hl->nranges && len < n) {
+        len += _get_bracketed_list(hl, &i, n - len, buf + len);
+        if ((len > 0) && (len < n) && (i < hl->nranges))
+            buf[len++] = ',';
+    }
+    UNLOCK_HOSTLIST(hl);
+
+    /* NUL terminate */
+    if (len >= n) {
+        truncated = 1;
+        if (n > 0)
+            buf[n-1] = '\0';
+    } else
+        buf[len > 0 ? len : 0] = '\0';
+
+    return truncated ? -1 : len;
+}
+
+/* ----[ hostlist iterator functions ]---- */
+
+static hostlist_iterator_t hostlist_iterator_new(void)
+{
+    hostlist_iterator_t i = (hostlist_iterator_t) malloc(sizeof(*i));
+    if (!i)
+        return NULL;
+    i->hl = NULL;
+    i->hr = NULL;
+    i->idx = 0;
+    i->depth = -1;
+    i->next = i;
+    assert(i->magic = HOSTLIST_MAGIC);
+    return i;
+}
+
+hostlist_iterator_t hostlist_iterator_create(hostlist_t hl)
+{
+    hostlist_iterator_t i;
+
+    if (!(i = hostlist_iterator_new()))
+        out_of_memory("hostlist_iterator_create");
+
+    LOCK_HOSTLIST(hl);
+    i->hl = hl;
+    i->hr = hl->hr[0];
+    i->next = hl->ilist;
+    hl->ilist = i;
+    UNLOCK_HOSTLIST(hl);
+    return i;
+}
+
+hostlist_iterator_t hostset_iterator_create(hostset_t set)
+{
+    return hostlist_iterator_create(set->hl);
+}
+
+void hostlist_iterator_reset(hostlist_iterator_t i)
+{
+    assert(i != NULL);
+    assert(i->magic == HOSTLIST_MAGIC);
+    i->idx = 0;
+    i->hr = i->hl->hr[0];
+    i->depth = -1;
+    return;
+}
+
+void hostlist_iterator_destroy(hostlist_iterator_t i)
+{
+    hostlist_iterator_t *pi;
+    if (i == NULL)
+        return;
+    assert(i != NULL);
+    assert(i->magic == HOSTLIST_MAGIC);
+    LOCK_HOSTLIST(i->hl);
+    for (pi = &i->hl->ilist; *pi; pi = &(*pi)->next) {
+        assert((*pi)->magic == HOSTLIST_MAGIC);
+        if (*pi == i) {
+            *pi = (*pi)->next;
+            break;
+        }
+    }
+    UNLOCK_HOSTLIST(i->hl);
+    assert(i->magic = 0x1);
+    free(i);
+}
+
+static void _iterator_advance(hostlist_iterator_t i)
+{
+    assert(i != NULL);
+    assert(i->magic == HOSTLIST_MAGIC);
+    if (i->idx > i->hl->nranges - 1)
+        return;
+    if (++(i->depth) > (i->hr->hi - i->hr->lo)) {
+        i->depth = 0;
+        i->hr = i->hl->hr[++i->idx];
+    }
+}
+
+/* advance iterator to end of current range (meaning within "[" "]")
+ * i.e. advance iterator past all range objects that could be represented
+ * in on bracketed hostlist.
+ */
+static void _iterator_advance_range(hostlist_iterator_t i)
+{
+    int nr, j;
+    hostrange_t *hr;
+    assert(i != NULL);
+    assert(i->magic == HOSTLIST_MAGIC);
+
+    nr = i->hl->nranges;
+    hr = i->hl->hr;
+    j = i->idx;
+    if (++i->depth > 0) {
+        while (++j < nr && hostrange_within_range(i->hr, hr[j])) {;}
+        i->idx = j;
+        i->hr = i->hl->hr[i->idx];
+        i->depth = 0;
+    }
+}
+
+char *hostlist_next(hostlist_iterator_t i)
+{
+    char *buf = NULL;
+    char suffix[16];
+    int len = 0;
+    assert(i != NULL);
+    assert(i->magic == HOSTLIST_MAGIC);
+    LOCK_HOSTLIST(i->hl);
+    _iterator_advance(i);
+
+    if (i->idx > i->hl->nranges - 1) {
+        UNLOCK_HOSTLIST(i->hl);
+        return NULL;
+    }
+
+    suffix[0] = '\0';
+
+    if (!i->hr->singlehost)
+        snprintf (suffix, 15, "%0*lu", i->hr->width, i->hr->lo + i->depth);
+
+    len = strlen (i->hr->prefix) + strlen (suffix) + 1;
+    if (!(buf = malloc (len)))
+        out_of_memory("hostlist_next");
+
+    buf[0] = '\0';
+    strcat (buf, i->hr->prefix);
+    strcat (buf, suffix);
+
+    UNLOCK_HOSTLIST(i->hl);
+    return (buf);
+}
+
+char *hostlist_next_range(hostlist_iterator_t i)
+{
+    char buf[MAXHOSTRANGELEN + 1];
+    int j;
+
+    assert(i != NULL);
+    assert(i->magic == HOSTLIST_MAGIC);
+    LOCK_HOSTLIST(i->hl);
+
+    _iterator_advance_range(i);
+
+    if (i->idx > i->hl->nranges - 1) {
+        UNLOCK_HOSTLIST(i->hl);
+        return NULL;
+    }
+
+    j = i->idx;
+    _get_bracketed_list(i->hl, &j, MAXHOSTRANGELEN, buf);
+
+    UNLOCK_HOSTLIST(i->hl);
+
+    return strdup(buf);
+}
+
+int hostlist_remove(hostlist_iterator_t i)
+{
+    hostrange_t new;
+    assert(i != NULL);
+    assert(i->magic == HOSTLIST_MAGIC);
+    LOCK_HOSTLIST(i->hl);
+    new = hostrange_delete_host(i->hr, i->hr->lo + i->depth);
+    if (new) {
+        hostlist_insert_range(i->hl, new, i->idx + 1);
+        hostrange_destroy(new);
+        i->hr = i->hl->hr[++i->idx];
+        i->depth = -1;
+    } else if (hostrange_empty(i->hr)) {
+        hostlist_delete_range(i->hl, i->idx);
+    } else
+        i->depth--;
+
+    i->hl->nhosts--;
+    UNLOCK_HOSTLIST(i->hl);
+
+    return 1;
+}
+
+/* ----[ hostset functions ]---- */
+
+hostset_t hostset_create(const char *hostlist)
+{
+    hostset_t new;
+
+    if (!(new = (hostset_t) malloc(sizeof(*new))))
+        goto error1;
+
+    if (!(new->hl = hostlist_create(hostlist)))
+        goto error2;
+
+    hostlist_uniq(new->hl);
+    return new;
+
+  error2:
+    free(new);
+  error1:
+    return NULL;
+}
+
+hostset_t hostset_copy(const hostset_t set)
+{
+    hostset_t new;
+    if (!(new = (hostset_t) malloc(sizeof(*new))))
+        goto error1;
+
+    if (!(new->hl = hostlist_copy(set->hl)))
+        goto error2;
+
+    return new;
+  error2:
+    free(new);
+  error1:
+    return NULL;
+}
+
+void hostset_destroy(hostset_t set)
+{
+    if (set == NULL)
+        return;
+    hostlist_destroy(set->hl);
+    free(set);
+}
+
+/* inserts a single range object into a hostset
+ * Assumes that the set->hl lock is already held
+ * Updates hl->nhosts
+ */
+static int hostset_insert_range(hostset_t set, hostrange_t hr)
+{
+    int i = 0;
+    int inserted = 0;
+    int nhosts = 0;
+    int ndups = 0;
+    hostlist_t hl;
+
+    hl = set->hl;
+
+    if (hl->size == hl->nranges && !hostlist_expand(hl))
+        return 0;
+
+    nhosts = hostrange_count(hr);
+
+    for (i = 0; i < hl->nranges; i++) {
+        if (hostrange_cmp(hr, hl->hr[i]) <= 0) {
+
+            if ((ndups = hostrange_join(hr, hl->hr[i])) >= 0)
+                hostlist_delete_range(hl, i);
+            else if (ndups < 0)
+                ndups = 0;
+
+            hostlist_insert_range(hl, hr, i);
+
+            /* now attempt to join hr[i] and hr[i-1] */
+            if (i > 0) {
+                int m;
+                if ((m = _attempt_range_join(hl, i)) > 0)
+                    ndups += m;
+            }
+            hl->nhosts += nhosts - ndups;
+            inserted = 1;
+            break;
+        }
+    }
+
+    if (inserted == 0) {
+        hl->hr[hl->nranges++] = hostrange_copy(hr);
+        hl->nhosts += nhosts;
+        if (hl->nranges > 1) {
+            if ((ndups = _attempt_range_join(hl, hl->nranges - 1)) <= 0)
+                ndups = 0;
+        }
+    }
+
+    /*
+     *  Return the number of unique hosts inserted
+     */
+    return nhosts - ndups;
+}
+
+int hostset_insert(hostset_t set, const char *hosts)
+{
+    int i, n = 0;
+    hostlist_t hl = hostlist_create(hosts);
+    if (!hl)
+        return 0;
+
+    hostlist_uniq(hl);
+    LOCK_HOSTLIST(set->hl);
+    for (i = 0; i < hl->nranges; i++)
+        n += hostset_insert_range(set, hl->hr[i]);
+    UNLOCK_HOSTLIST(set->hl);
+    hostlist_destroy(hl);
+    return n;
+}
+
+
+/* linear search through N ranges for hostname "host"
+ * */
+static int hostset_find_host(hostset_t set, const char *host)
+{
+    int i;
+    int retval = 0;
+    hostname_t hn;
+    LOCK_HOSTLIST(set->hl);
+    hn = hostname_create(host);
+    for (i = 0; i < set->hl->nranges; i++) {
+        if (hostrange_hn_within(set->hl->hr[i], hn)) {
+            retval = 1;
+            goto done;
+        }
+    }
+  done:
+    UNLOCK_HOSTLIST(set->hl);
+    hostname_destroy(hn);
+    return retval;
+}
+
+int hostset_within(hostset_t set, const char *hosts)
+{
+    int nhosts, nfound;
+    hostlist_t hl;
+    char *hostname;
+
+    assert(set->hl->magic == HOSTLIST_MAGIC);
+
+    hl = hostlist_create(hosts);
+    nhosts = hostlist_count(hl);
+    nfound = 0;
+
+    while ((hostname = hostlist_pop(hl)) != NULL) {
+        nfound += hostset_find_host(set, hostname);
+        free(hostname);
+    }
+
+    hostlist_destroy(hl);
+
+    return (nhosts == nfound);
+}
+
+int hostset_delete(hostset_t set, const char *hosts)
+{
+    return hostlist_delete(set->hl, hosts);
+}
+
+int hostset_delete_host(hostset_t set, const char *hostname)
+{
+    return hostlist_delete_host(set->hl, hostname);
+}
+
+char *hostset_shift(hostset_t set)
+{
+    return hostlist_shift(set->hl);
+}
+
+char *hostset_pop(hostset_t set)
+{
+    return hostlist_pop(set->hl);
+}
+
+char *hostset_shift_range(hostset_t set)
+{
+    return hostlist_shift_range(set->hl);
+}
+
+char *hostset_pop_range(hostset_t set)
+{
+    return hostlist_pop_range(set->hl);
+}
+
+int hostset_count(hostset_t set)
+{
+    return hostlist_count(set->hl);
+}
+
+size_t hostset_ranged_string(hostset_t set, size_t n, char *buf)
+{
+    return hostlist_ranged_string(set->hl, n, buf);
+}
+
+size_t hostset_deranged_string(hostset_t set, size_t n, char *buf)
+{
+    return hostlist_deranged_string(set->hl, n, buf);
+}
+
+#if TEST_MAIN
+
+int hostlist_nranges(hostlist_t hl)
+{
+    return hl->nranges;
+}
+
+int hostset_nranges(hostset_t set)
+{
+    return set->hl->nranges;
+}
+
+/* test iterator functionality on the list of hosts represented
+ * by list
+ */
+int iterator_test(char *list)
+{
+    int j;
+    char buf[1024];
+    hostlist_t hl = hostlist_create(list);
+    hostset_t set = hostset_create(list);
+
+    hostlist_iterator_t i = hostlist_iterator_create(hl);
+    hostlist_iterator_t seti = hostset_iterator_create(set);
+    hostlist_iterator_t i2 = hostlist_iterator_create(hl);
+    char *host;
+
+
+    hostlist_ranged_string(hl, 1024, buf);
+    printf("iterator_test: hl = `%s' passed in `%s'\n", buf, list);
+    host = hostlist_next(i);
+    printf("first host in list hl = `%s'\n", host);
+    free(host);
+
+    /* forge ahead three hosts with i2 */
+    for (j = 0; j < 4; j++) {
+        host = hostlist_next(i2);
+        free(host);
+    }
+
+    host = hostlist_shift(hl);
+    printf("result of shift(hl)   = `%s'\n", host);
+    free(host);
+    host = hostlist_next(i);
+    printf("next host in list hl  = `%s'\n", host);
+    free(host);
+    host = hostlist_next(i2);
+    printf("next host for i2      = `%s'\n", host);
+    free(host);
+
+    hostlist_iterator_destroy(i);
+
+    hostlist_destroy(hl);
+    hostset_destroy(set);
+    return 1;
+}
+
+int main(int ac, char **av)
+{
+    char buf[1024000];
+    int i;
+    char *str;
+
+    hostlist_t hl1, hl2, hl3;
+    hostset_t set, set1;
+    hostlist_iterator_t iter, iter2;
+
+    if (!(hl1 = hostlist_create(ac > 1 ? av[1] : NULL)))
+        perror("hostlist_create");
+    if (!(set = hostset_create(ac > 1 ? av[1] : NULL)))
+        perror("hostlist_create");
+
+    hl3 = hostlist_create("f[0-5]");
+    hostlist_delete(hl3, "f[1-3]");
+    hostlist_ranged_string(hl3, 102400, buf);
+    printf("after delete = `%s'\n", buf);
+    hostlist_destroy(hl3);
+
+    for (i = 2; i < ac; i++) {
+        hostlist_push(hl1, av[i]);
+        hostset_insert(set, av[i]);
+    }
+
+    hostlist_ranged_string(hl1, 102400, buf);
+    printf("ranged   = `%s'\n", buf);
+
+    iterator_test(buf);
+
+    hostlist_deranged_string(hl1, 10240, buf);
+    printf("deranged = `%s'\n", buf);
+
+    hostset_ranged_string(set, 1024, buf);
+    printf("hostset  = `%s'\n", buf);
+
+    hostlist_sort(hl1);
+    hostlist_ranged_string(hl1, 1024, buf);
+    printf("sorted   = `%s'\n", buf);
+
+    hostlist_uniq(hl1);
+    hostlist_ranged_string(hl1, 1024, buf);
+    printf("uniqed   = `%s'\n", buf);
+
+    hl2 = hostlist_copy(hl1);
+    printf("pop_range: ");
+    while ((str = hostlist_pop_range(hl2))) {
+        printf("`%s' ", str);
+        free(str);
+    }
+    hostlist_destroy(hl2);
+    printf("\n");
+
+    hl2 = hostlist_copy(hl1);
+    printf("shift_range: ");
+    while ((str = hostlist_shift_range(hl2))) {
+        printf("`%s' ", str);
+        free(str);
+    }
+    hostlist_destroy(hl2);
+    printf("\n");
+
+    iter = hostset_iterator_create(set);
+    iter2 = hostset_iterator_create(set);
+    hostlist_iterator_destroy(iter2);
+
+    printf("next: ");
+    while ((str = hostlist_next(iter))) {
+        printf("`%s' ", str);
+        free(str);
+    }
+    printf("\n");
+
+    hostlist_iterator_reset(iter);
+    printf("next_range: ");
+    while ((str = hostlist_next_range(iter))) {
+        printf("`%s' ", str);
+        free(str);
+    }
+    printf("\n");
+
+    printf("nranges = %d\n", hostset_nranges(set));
+
+    hostset_ranged_string(set, 1024, buf);
+    printf("set = %s\n", buf);
+
+    hostset_destroy(set);
+    hostlist_destroy(hl1);
+    return 0;
+}
+
+#endif                /* TEST_MAIN */
+
+/*
+ * vi: tabstop=4 shiftwidth=4 expandtab
+ */
diff --git a/lustre/utils/hostlist.h b/lustre/utils/hostlist.h

new file mode 100644 (file)

index 0000000..8b3d509
--- /dev/null
+++ b/lustre/utils/hostlist.h
@@ -0,0 +1,417 @@
+/*****************************************************************************\
+ *  $Id: hostlist.h,v 1.1.10.2 2008/12/18 18:02:14 johann Exp $
+ *****************************************************************************
+ *  Copyright (C) 2002 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Mark Grondona <mgrondona@llnl.gov>
+ *  UCRL-CODE-2002-040.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+
+#ifndef _HOSTLIST_H
+#define _HOSTLIST_H
+
+/* Notes:
+ *
+ * If WITH_LSD_FATAL_ERROR_FUNC is defined, the linker will expect to
+ * find and external lsd_fatal_error(file,line,mesg) function. By default,
+ * lsd_fatal_error(file,line,mesg) is a macro definition that outputs an
+ * error message to stderr. This macro may be redefined to invoke another
+ * routine instead. e.g.:
+ *
+ *    #define lsd_fatal_error(file,line,mesg)  \
+ *              error("%s:%s %s\n",file,line,mesg);
+ *
+ * If WITH_LSD_NOMEM_ERROR_FUNC is defined, the linker will expect to
+ * find an external lsd_nomem_error(file,line,mesg) function. By default,
+ * lsd_nomem_error(file,line,mesg) is a macro definition that returns NULL.
+ * This macro may be redefined to invoke another routine instead.
+ *
+ * If WITH_PTHREADS is defined, these routines will be thread-safe.
+ *
+ */
+
+/* The hostlist opaque data type
+ *
+ * A hostlist is a list of hostnames optimized for a prefixXXXX style
+ * naming convention, where XXXX  is a decimal, numeric suffix.
+ */
+typedef struct hostlist * hostlist_t;
+
+/* A hostset is a special case of a hostlist. It:
+ *
+ * 1. never contains duplicates
+ * 2. is always sorted
+ *    (Note: sort occurs first on alphanumeric prefix -- where prefix
+ *     matches, numeric suffixes will be sorted *by value*)
+ */
+typedef struct hostset * hostset_t;
+
+/* The hostlist iterator type (may be used with a hostset as well)
+ * used for non-destructive access to hostlist members.
+ *
+ */
+typedef struct hostlist_iterator * hostlist_iterator_t;
+
+/* ----[ hostlist_t functions: ]---- */
+
+/* ----[ hostlist creation and destruction ]---- */
+
+/*
+ * hostlist_create():
+ *
+ * Create a new hostlist from a string representation.
+ *
+ * The string representation (str) may contain one or more hostnames or
+ * bracketed hostlists separated by either `,' or whitespace. A bracketed
+ * hostlist is denoted by a common prefix followed by a list of numeric
+ * ranges contained within brackets: e.g. "tux[0-5,12,20-25]"
+ *
+ * Note: if this module is compiled with WANT_RECKLESS_HOSTRANGE_EXPANSION
+ * defined, a much more loose interpretation of host ranges is used.
+ * Reckless hostrange expansion allows all of the following (in addition to
+ * bracketed hostlists):
+ *
+ *  o tux0-5,tux12,tux20-25
+ *  o tux0-tux5,tux12,tux20-tux25
+ *  o tux0-5,12,20-25
+ *
+ * If str is NULL, and empty hostlist is created and returned.
+ *
+ * If the create fails, hostlist_create() returns NULL.
+ *
+ * The returned hostlist must be freed with hostlist_destroy()
+ *
+ */
+hostlist_t hostlist_create(const char *hostlist);
+
+/* hostlist_copy():
+ *
+ * Allocate a copy of a hostlist object. Returned hostlist must be freed
+ * with hostlist_destroy.
+ */
+hostlist_t hostlist_copy(const hostlist_t hl);
+
+/* hostlist_destroy():
+ *
+ * Destroy a hostlist object. Frees all memory allocated to the hostlist.
+ */
+void hostlist_destroy(hostlist_t hl);
+
+
+/* ----[ hostlist list operations ]---- */
+
+/* hostlist_push():
+ *
+ * push a string representation of hostnames onto a hostlist.
+ *
+ * The hosts argument may take the same form as in hostlist_create()
+ *
+ * Returns the number of hostnames inserted into the list,
+ * or 0 on failure.
+ */
+int hostlist_push(hostlist_t hl, const char *hosts);
+
+
+/* hostlist_push_host():
+ *
+ * Push a single host onto the hostlist hl.
+ * This function is more efficient than hostlist_push() for a single
+ * hostname, since the argument does not need to be checked for ranges.
+ *
+ * return value is 1 for success, 0 for failure.
+ */
+int hostlist_push_host(hostlist_t hl, const char *host);
+
+
+/* hostlist_push_list():
+ *
+ * Push a hostlist (hl2) onto another list (hl1)
+ *
+ * Returns 1 for success, 0 for failure.
+ *
+ */
+int hostlist_push_list(hostlist_t hl1, hostlist_t hl2);
+
+
+/* hostlist_pop():
+ *
+ * Returns the string representation of the last host pushed onto the list
+ * or NULL if hostlist is empty or there was an error allocating memory.
+ * The host is removed from the hostlist.
+ *
+ * Note: Caller is responsible for freeing the returned memory.
+ */
+char * hostlist_pop(hostlist_t hl);
+
+
+char * hostlist_nth(hostlist_t hl, int n);
+
+/* hostlist_shift():
+ *
+ * Returns the string representation of the first host in the hostlist
+ * or NULL if the hostlist is empty or there was an error allocating memory.
+ * The host is removed from the hostlist.
+ *
+ * Note: Caller is responsible for freeing the returned memory.
+ */
+char * hostlist_shift(hostlist_t hl);
+
+
+/* hostlist_pop_range():
+ *
+ * Pop the last bracketed list of hosts of the hostlist hl.
+ * Returns the string representation in bracketed list form.
+ * All hosts associated with the returned list are removed
+ * from hl.
+ *
+ * Caller is responsible for freeing returned memory
+ */
+char * hostlist_pop_range(hostlist_t hl);
+
+/* hostlist_shift_range():
+ *
+ * Shift the first bracketed hostlist (improperly: range) off the
+ * hostlist hl. Returns the string representation in bracketed list
+ * form. All hosts associated with the list are removed from the
+ * hostlist.
+ *
+ * Caller is responsible for freeing returned memory.
+ */
+char * hostlist_shift_range(hostlist_t hl);
+
+
+/* hostlist_find():
+ *
+ * Searches hostlist hl for the first host matching hostname
+ * and returns position in list if found.
+ *
+ * Returns -1 if host is not found.
+ *
+ */
+int hostlist_find(hostlist_t hl, const char *hostname);
+
+/* hostlist_delete():
+ *
+ * Deletes all hosts in the list represented by `hosts'
+ *
+ * Returns the number of hosts successfully deleted
+ */
+int hostlist_delete(hostlist_t hl, const char *hosts);
+
+
+/* hostlist_delete_host():
+ *
+ * Deletes the first host that matches `hostname' from the hostlist hl.
+ * Note: "hostname" argument cannot contain a range of hosts
+ *       (see hostlist_delete() for this functionality.)
+ *
+ * Returns 1 if successful, 0 if hostname is not found in list.
+ */
+int hostlist_delete_host(hostlist_t hl, const char *hostname);
+
+
+/* hostlist_delete_nth():
+ *
+ * Deletes the host from position n in the hostlist.
+ *
+ * Returns 1 if successful 0 on error.
+ *
+ */
+int hostlist_delete_nth(hostlist_t hl, int n);
+
+
+/* hostlist_count():
+ *
+ * Return the number of hosts in hostlist hl.
+ */
+int hostlist_count(hostlist_t hl);
+
+/* hostlist_is_empty(): return true if hostlist is empty. */
+#define hostlist_is_empty(__hl) ( hostlist_count(__hl) == 0 )
+
+/* ----[ Other hostlist operations ]---- */
+
+/* hostlist_sort():
+ *
+ * Sort the hostlist hl.
+ *
+ */
+void hostlist_sort(hostlist_t hl);
+
+/* hostlist_uniq():
+ *
+ * Sort the hostlist hl and remove duplicate entries.
+ *
+ */
+void hostlist_uniq(hostlist_t hl);
+
+
+/* ----[ hostlist print functions ]---- */
+
+/* hostlist_ranged_string():
+ *
+ * Write the string representation of the hostlist hl into buf,
+ * writing at most n chars. Returns the number of bytes written,
+ * or -1 if truncation occurred.
+ *
+ * The result will be NULL terminated.
+ *
+ * hostlist_ranged_string() will write a bracketed hostlist representation
+ * where possible.
+ */
+size_t hostlist_ranged_string(hostlist_t hl, size_t n, char *buf);
+size_t hostset_ranged_string(hostset_t hs, size_t n, char *buf);
+
+/* hostlist_deranged_string():
+ *
+ * Writes the string representation of the hostlist hl into buf,
+ * writing at most n chars. Returns the number of bytes written,
+ * or -1 if truncation occurred.
+ *
+ * hostlist_deranged_string() will not attempt to write a bracketed
+ * hostlist representation. Every hostname will be explicitly written.
+ */
+size_t hostlist_deranged_string(hostlist_t hl, size_t n, char *buf);
+size_t hostset_deranged_string(hostset_t hs, size_t n, char *buf);
+
+
+/* ----[ hostlist utility functions ]---- */
+
+
+/* hostlist_nranges():
+ *
+ * Return the number of ranges currently held in hostlist hl.
+ */
+int hostlist_nranges(hostlist_t hl);
+
+
+/* ----[ hostlist iterator functions ]---- */
+
+/* hostlist_iterator_create():
+ *
+ * Creates and returns a hostlist iterator used for non destructive
+ * access to a hostlist or hostset. Returns NULL on failure.
+ */
+hostlist_iterator_t hostlist_iterator_create(hostlist_t hl);
+
+/* hostset_iterator_create():
+ *
+ * Same as hostlist_iterator_create(), but creates a hostlist_iterator
+ * from a hostset.
+ */
+hostlist_iterator_t hostset_iterator_create(hostset_t set);
+
+/* hostlist_iterator_destroy():
+ *
+ * Destroys a hostlist iterator.
+ */
+void hostlist_iterator_destroy(hostlist_iterator_t i);
+
+/* hostlist_iterator_reset():
+ *
+ * Reset an iterator to the beginning of the list.
+ */
+void hostlist_iterator_reset(hostlist_iterator_t i);
+
+/* hostlist_next():
+ *
+ * Returns a pointer to the  next hostname on the hostlist
+ * or NULL at the end of the list
+ *
+ * The caller is responsible for freeing the returned memory.
+ */
+char * hostlist_next(hostlist_iterator_t i);
+
+
+/* hostlist_next_range():
+ *
+ * Returns the next bracketed hostlist or NULL if the iterator i is
+ * at the end of the list.
+ *
+ * The caller is responsible for freeing the returned memory.
+ *
+ */
+char * hostlist_next_range(hostlist_iterator_t i);
+
+
+/* hostlist_remove():
+ * Removes the last host returned by hostlist iterator i
+ *
+ * Returns 1 for success, 0 for failure.
+ */
+int hostlist_remove(hostlist_iterator_t i);
+
+
+/* ----[ hostset operations ]---- */
+
+/* hostset_create():
+ *
+ * Create a new hostset object from a string representation of a list of
+ * hosts. See hostlist_create() for valid hostlist forms.
+ */
+hostset_t hostset_create(const char *hostlist);
+
+/* hostset_copy():
+ *
+ * Copy a hostset object. Returned set must be freed with hostset_destroy().
+ */
+hostset_t hostset_copy(hostset_t set);
+
+/* hostset_destroy():
+ */
+void hostset_destroy(hostset_t set);
+
+/* hostset_insert():
+ * Add a host or list of hosts into hostset "set."
+ *
+ * Returns number of hosts successfully added to "set"
+ * (insertion of a duplicate is not considered successful)
+ */
+int hostset_insert(hostset_t set, const char *hosts);
+
+/* hostset_delete():
+ * Delete a host or list of hosts from hostset "set."
+ * Returns number of hosts deleted from set.
+ */
+int hostset_delete(hostset_t set, const char *hosts);
+
+/* hostset_within():
+ * Return 1 if all hosts specified by "hosts" are within the hostset "set"
+ * Retrun 0 if every host in "hosts" is not in the hostset "set"
+ */
+int hostset_within(hostset_t set, const char *hosts);
+
+/* hostset_shift():
+ * hostset equivalent to hostlist_shift()
+ */
+char * hostset_shift(hostset_t set);
+
+/* hostset_shift_range():
+ * hostset eqivalent to hostlist_shift_range()
+ */
+char * hostset_shift_range(hostset_t set);
+
+/* hostset_count():
+ * Count the number of hosts currently in hostset
+ */
+int hostset_count(hostset_t set);
+
+
+#endif /* !_HOSTLIST_H */
diff --git a/lustre/utils/l_getgroups.c b/lustre/utils/l_getgroups.c

index 1aa53e7..10064b3 100644 (file)
--- a/lustre/utils/l_getgroups.c
+++ b/lustre/utils/l_getgroups.c
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2004 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  
  #include <stdlib.h>
diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c

index 7381c70..2afdca7 100644 (file)
--- a/lustre/utils/lctl.c
+++ b/lustre/utils/lctl.c
@@ -1,26 +1,43 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Robert Read <rread@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/lctl.c
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Robert Read <rread@clusterfs.com>
   */
  
  #include <stdlib.h>
@@ -191,6 +208,24 @@ command_t cmdlist[] = {
           "get the device info of a attached file\n"
           "usage: blockdev_info <device_name>"},
  
+        /* Pool commands */
+        {"===  Pools ==", jt_noop, 0, "pool management"},
+        {"pool_new", jt_pool_cmd, 0,
+         "add a new pool\n"
+         "usage pool_new <fsname>.<poolname>"},
+        {"pool_add", jt_pool_cmd, 0,
+         "add the named OSTs to the pool\n"
+         "usage pool_add <fsname>.<poolname> <ostname indexed list>"},
+        {"pool_remove", jt_pool_cmd, 0,
+         "remove the named OST from the pool\n"
+         "usage pool_remove <fsname>.<poolname> <ostname indexed list>"},
+        {"pool_destroy", jt_pool_cmd, 0,
+         "destroy a pool\n"
+         "usage pool_destroy <fsname>.<poolname>"},
+        {"pool_list", jt_pool_cmd, 0,
+         "list pools and pools members\n"
+         "usage pool_list  <fsname>[.<poolname>] | <pathname>"},
+
          /* Test only commands */
          {"==== testing (DANGEROUS) ====", jt_noop, 0, "testing (DANGEROUS)"},
          {"--threads", jt_opt_threads, 0,
@@ -209,7 +244,7 @@ command_t cmdlist[] = {
          {"add_peer", jt_ptl_add_peer, 0, "add an peer entry\n"
           "usage: add_peer <nid> <host> <port>"},
          {"del_peer", jt_ptl_del_peer, 0, "remove an peer entry\n"
-         "usage: del_autoconn [<nid>] [<host>] [ks]"},
+         "usage: del_peer [<nid>] [<ipaddr|pid>]"},
          {"add_conn ", jt_lcfg_add_conn, 0,
           "usage: add_conn <conn_uuid> [priority]\n"},
          {"del_conn ", jt_lcfg_del_conn, 0,
diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c

index 45d5f80..631e070 100644 (file)
--- a/lustre/utils/lfs.c
+++ b/lustre/utils/lfs.c
@@ -1,28 +1,50 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Robert Read <rread@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/lfs.c
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Robert Read <rread@clusterfs.com>
   */
  
+/* for O_DIRECTORY */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
  #include <stdlib.h>
  #include <stdio.h>
  #include <getopt.h>
@@ -33,10 +55,14 @@
  #include <grp.h>
  #include <sys/types.h>
  #include <sys/stat.h>
+#include <sys/param.h>
  #include <fcntl.h>
  #include <dirent.h>
  #include <time.h>
  #include <ctype.h>
+#ifdef HAVE_SYS_QUOTA_H
+#include <sys/quota.h>
+#endif
  
  #include <lnet/api-support.h>
  #include <lnet/lnetctl.h>
@@ -59,7 +85,7 @@ static int lfs_osts(int argc, char **argv);
  static int lfs_df(int argc, char **argv);
  static int lfs_check(int argc, char **argv);
  static int lfs_catinfo(int argc, char **argv);
-#ifdef HAVE_QUOTA_SUPPORT
+#ifdef HAVE_SYS_QUOTA_H
  static int lfs_quotachown(int argc, char **argv);
  static int lfs_quotacheck(int argc, char **argv);
  static int lfs_quotaon(int argc, char **argv);
@@ -69,6 +95,7 @@ static int lfs_quota(int argc, char **argv);
  static int lfs_quotainv(int argc, char **argv);
  #endif
  static int lfs_join(int argc, char **argv);
+static int lfs_poollist(int argc, char **argv);
  
  /* all avaialable commands */
  command_t cmdlist[] = {
@@ -76,30 +103,34 @@ command_t cmdlist[] = {
           "Create a new file with a specific striping pattern or\n"
           "set the default striping pattern on an existing directory or\n"
           "delete the default striping pattern from an existing directory\n"
-         "usage: setstripe <filename|dirname> <stripe_size> <stripe_index> <stripe_count>\n"
-         "       or \n"
-         "       setstripe <filename|dirname> [--size|-s stripe_size]\n"
-         "                                    [--index|-i stripe_index]\n"
-         "                                    [--count|-c stripe_count]\n"
+         "usage: setstripe [--size|-s stripe_size] [--offset|-o start_ost]\n"
+         "                 [--count|-c stripe_count] [--pool|-p pool_name]\n"
+         "                 <dir|filename>\n"
           "       or \n"
-         "       setstripe -d <dirname>   (to delete default striping)\n"
+         "       setstripe -d <dir>   (to delete default striping)\n"
           "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n"
-         "\t              Can be specified with k, m or g (in KB, MB and GB respectively)\n"
-         "\tstripe_index: OST index of first stripe (-1 filesystem default)\n"
-         "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)"},
+         "\t              Can be specified with k, m or g (in KB, MB and GB\n"
+         "\t              respectively)\n"
+         "\tstart_ost:    OST index of first stripe (-1 filesystem default)\n"
+         "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n"
+         "\tpool_name:    Name of OST pool"},
          {"getstripe", lfs_getstripe, 0,
-         "To list the striping info for a given filename or files in a\n"
+         "To list the striping info for a given file or files in a\n"
           "directory or recursively for all files in a directory tree.\n"
           "usage: getstripe [--obd|-O <uuid>] [--quiet | -q] [--verbose | -v]\n"
           "                 [--recursive | -r] <dir|file> ..."},
+        {"poollist", lfs_poollist, 0,
+         "List pools or pool OSTs\n"
+         "usage: poollist <fsname>[.<poolname>] | <pathname>\n"},
          {"find", lfs_find, 0,
           "To find files that match given parameters recursively in a directory tree.\n"
-         "usage: find <dir/file> ... \n"
+         "usage: find <dir|file> ... \n"
           "     [[!] --atime|-A [+-]N] [[!] --mtime|-M [+-]N] [[!] --ctime|-C [+-]N]\n"
           "     [--maxdepth|-D N] [[!] --name|-n <pattern>] [--print0|-P]\n"
           "     [--print|-p] [--obd|-O <uuid[s]>] [[!] --size|-s [+-]N[bkMGTP]]\n"
           "     [[!] --type|-t <filetype>] [[!] --gid|-g N] [[!] --group|-G <name>]\n"
           "     [[!] --uid|-u N] [[!] --user|-U <name>]\n"
+         "     [[!] --pool <name>]\n"
           "\t !: used before an option indicates 'NOT' the requested attribute\n"
           "\t -: used before an value indicates 'AT MOST' the requested value\n"
           "\t +: used before an option indicates 'AT LEAST' the requested value\n"},
@@ -120,7 +151,7 @@ command_t cmdlist[] = {
           "report filesystem disk space usage or inodes usage"
           "of each MDS/OSD.\n"
           "Usage: df [-i] [-h] [path]"},
-#ifdef HAVE_QUOTA_SUPPORT
+#ifdef HAVE_SYS_QUOTA_H
          {"quotachown",lfs_quotachown, 0,
           "Change files' owner or group on the specified filesystem.\n"
           "usage: quotachown [-i] <filesystem>\n"
@@ -159,6 +190,21 @@ command_t cmdlist[] = {
          { 0, 0, 0, NULL }
  };
  
+static int isnumber(const char *str)
+{
+        const char *ptr;
+
+        if (str[0] != '-' && !isdigit(str[0]))
+                return 0;
+
+        for (ptr = str + 1; *ptr != '\0'; ptr++) {
+                if (!isdigit(*ptr))
+                        return 0;
+        }
+
+        return 1;
+}
+
  /* functions */
  static int lfs_setstripe(int argc, char **argv)
  {
@@ -172,12 +218,15 @@ static int lfs_setstripe(int argc, char **argv)
          char *stripe_size_arg = NULL;
          char *stripe_off_arg = NULL;
          char *stripe_count_arg = NULL;
+        char *pool_name_arg = NULL;
          unsigned long long size_units;
  
          struct option long_opts[] = {
                  {"size",        required_argument, 0, 's'},
                  {"count",       required_argument, 0, 'c'},
                  {"index",       required_argument, 0, 'i'},
+                {"offset",      required_argument, 0, 'o'},
+                {"pool",        required_argument, 0, 'p'},
                  {"delete",      no_argument,       0, 'd'},
                  {0, 0, 0, 0}
          };
@@ -185,15 +234,12 @@ static int lfs_setstripe(int argc, char **argv)
          st_size = 0;
          st_offset = -1;
          st_count = 0;
-        if (argc == 3 && strcmp(argv[1], "-d") == 0) {
-                /* for compatibility with the existing positional parameter
-                 * usage */
-                fname = argv[2];
-                optind = 2;
-        } else if (argc == 5  && 
-                   (argv[2][0] != '-' || isdigit(argv[2][1])) &&
-                   (argv[3][0] != '-' || isdigit(argv[3][1])) &&
-                   (argv[4][0] != '-' || isdigit(argv[4][1])) ) {
+
+#if LUSTRE_VERSION < OBD_OCD_VERSION(2,1,0,0)
+        if (argc == 5 && argv[1][0] != '-' &&
+            isnumber(argv[2]) && isnumber(argv[3]) && isnumber(argv[4])) {
+                fprintf(stderr, "warning: deprecated usage of setstripe "
+                        "positional parameters.  Use -c, -i, -s instead.\n");
                  /* for compatibility with the existing positional parameter
                   * usage */
                  fname = argv[1];
@@ -201,10 +247,14 @@ static int lfs_setstripe(int argc, char **argv)
                  stripe_off_arg = argv[3];
                  stripe_count_arg = argv[4];
                  optind = 4;
-        } else {
+        } else
+#else
+#warning "remove obsolete positional parameter code"
+#endif
+        {
                  optind = 0;
-                while ((c = getopt_long(argc, argv, "c:di:s:",
-                                                long_opts, NULL)) >= 0) {
+                while ((c = getopt_long(argc, argv, "c:di:o:s:p:",
+                                        long_opts, NULL)) >= 0) {
                          switch (c) {
                          case 0:
                                  /* Long options. */
@@ -217,11 +267,15 @@ static int lfs_setstripe(int argc, char **argv)
                                  delete = 1;
                                  break;
                          case 'i':
+                        case 'o':
                                  stripe_off_arg = optarg;
                                  break;
                          case 's':
                                  stripe_size_arg = optarg;
                                  break;
+                        case 'p':
+                                pool_name_arg = optarg;
+                                break;
                          case '?':
                                  return CMD_HELP;
                          default:
@@ -232,25 +286,21 @@ static int lfs_setstripe(int argc, char **argv)
                          }
                  }
  
-                if (optind < argc)
-                        fname = argv[optind];
-                else
-                        return CMD_HELP;
+                fname = argv[optind];
  
-                if (delete && 
-                    (stripe_size_arg != NULL || stripe_off_arg != NULL || 
-                     stripe_count_arg != NULL)) {
+                if (delete &&
+                    (stripe_size_arg != NULL || stripe_off_arg != NULL ||
+                     stripe_count_arg != NULL || pool_name_arg != NULL)) {
                          fprintf(stderr, "error: %s: cannot specify -d with "
-                                        "-s, -c or -i options\n",
+                                        "-s, -c -o or -p options\n",
                                          argv[0]);
                          return CMD_HELP;
                  }
          }
  
-        if (optind != argc - 1) {
-                fprintf(stderr, "error: %s: only 1 filename|dirname can be "
-                                "specified: '%s'\n",
-                                argv[0], argv[argc-1]);
+        if (optind == argc) {
+                fprintf(stderr, "error: %s: missing filename|dirname\n",
+                        argv[0]);
                  return CMD_HELP;
          }
  
@@ -258,8 +308,8 @@ static int lfs_setstripe(int argc, char **argv)
          if (stripe_size_arg != NULL) {
                  result = parse_size(stripe_size_arg, &st_size, &size_units, 0);
                  if (result) {
-                        fprintf(stderr,"error: bad size '%s'\n",
-                                stripe_size_arg);
+                        fprintf(stderr, "error: %s: bad size '%s'\n",
+                                argv[0], stripe_size_arg);
                          return result;
                  }
          }
@@ -282,19 +332,33 @@ static int lfs_setstripe(int argc, char **argv)
                  }
          }
  
-        result = llapi_file_create(fname, st_size, st_offset, st_count, 0);
-        if (result)
-                fprintf(stderr, "error: %s: create stripe file failed\n",
-                                argv[0]);
+        do {
+                result = llapi_file_create_pool(fname, st_size, st_offset,
+                                                st_count, 0, pool_name_arg);
+                if (result) {
+                        fprintf(stderr,"error: %s: create stripe file '%s' "
+                                "failed\n", argv[0], fname);
+                        break;
+                }
+                fname = argv[++optind];
+        } while (fname != NULL);
  
          return result;
  }
  
+static int lfs_poollist(int argc, char **argv)
+{
+        if (argc != 2)
+                return CMD_HELP;
+
+        return llapi_poollist(argv[1]);
+}
+
  static int set_time(time_t *time, time_t *set, char *str)
  {
          time_t t;
          int res = 0;
-        
+
          if (str[0] == '+')
                  res = 1;
          else if (str[0] == '-')
@@ -315,9 +379,12 @@ static int set_time(time_t *time, time_t *set, char *str)
          return res;
  }
  
+#define USER 0
+#define GROUP 1
+
  static int name2id(unsigned int *id, char *name, int type)
  {
-        if (type == USRQUOTA) {
+        if (type == USER) {
                  struct passwd *entry;
  
                  if (!(entry = getpwnam(name))) {
@@ -344,7 +411,7 @@ static int name2id(unsigned int *id, char *name, int type)
  
  static int id2name(char **name, unsigned int id, int type)
  {
-        if (type == USRQUOTA) {
+        if (type == USER) {
                  struct passwd *entry;
  
                  if (!(entry = getpwuid(id))) {
@@ -369,6 +436,7 @@ static int id2name(char **name, unsigned int id, int type)
          return 0;
  }
  
+#define FIND_POOL_OPT 3
  static int lfs_find(int argc, char **argv)
  {
          int new_fashion = 1;
@@ -387,6 +455,8 @@ static int lfs_find(int argc, char **argv)
                  {"uid",       required_argument, 0, 'u'},
                  {"user",      required_argument, 0, 'U'},
                  {"name",      required_argument, 0, 'n'},
+                /* no short option for pool, p/P already used */
+                {"pool",      required_argument, 0, FIND_POOL_OPT},
                  /* --obd is considered as a new option. */
                  {"obd",       required_argument, 0, 'O'},
                  {"ost",       required_argument, 0, 'O'},
@@ -411,6 +481,7 @@ static int lfs_find(int argc, char **argv)
          time(&t);
  
          optind = 0;
+        /* when getopt_long_only() hits '!' it returns 1 and puts "!" in optarg */
          while ((c = getopt_long_only(argc, argv, "-A:C:D:g:G:M:n:PpO:qrs:t:u:U:v",
                                       long_opts, NULL)) >= 0) {
                  xtime = NULL;
@@ -418,6 +489,12 @@ static int lfs_find(int argc, char **argv)
                  if (neg_opt)
                          --neg_opt;
                  /* '!' is part of option */
+                /* when getopt_long_only() finds a string which is not
+                 * an option nor a known option argument it returns 1
+                 * in that case if we already have found pathstart and pathend
+                 * (i.e. we have the list of pathnames),
+                 * the only supported value is "!"
+                 */
                  isoption = (c != 1) || (strcmp(optarg, "!") == 0);
                  if (!isoption && pathend != -1) {
                          fprintf(stderr, "err: %s: filename|dirname must either "
@@ -439,6 +516,9 @@ static int lfs_find(int argc, char **argv)
                          /* Long options. */
                          break;
                  case 1:
+                        /* unknown; opt is "!" or path component,
+                         * checking done above.
+                         */
                          if (strcmp(optarg, "!") == 0)
                                  neg_opt = 2;
                        break;
@@ -492,8 +572,8 @@ static int lfs_find(int argc, char **argv)
                          new_fashion = 1;
                          param.gid = strtol(optarg, &endptr, 10);
                          if (optarg == endptr) {
-                               ret = name2id(&param.gid, optarg, GRPQUOTA);
-                               if (ret != 0) {
+                                ret = name2id(&param.gid, optarg, GROUP);
+                                if (ret != 0) {
                                          fprintf(stderr, "Group/GID: %s cannot "
                                                  "be found.\n", optarg);
                                          return -1;
@@ -516,8 +596,8 @@ static int lfs_find(int argc, char **argv)
                          new_fashion = 1;
                          param.uid = strtol(optarg, &endptr, 10);
                          if (optarg == endptr) {
-                               ret = name2id(&param.uid, optarg, USRQUOTA);
-                               if (ret != 0) {
+                                ret = name2id(&param.uid, optarg, USER);
+                                if (ret != 0) {
                                          fprintf(stderr, "User/UID: %s cannot "
                                                  "be found.\n", optarg);
                                          return -1;
@@ -526,6 +606,22 @@ static int lfs_find(int argc, char **argv)
                          param.exclude_uid = !!neg_opt;
                          param.check_uid = 1;
                          break;
+                case FIND_POOL_OPT:
+                        new_fashion = 1;
+                        if (strlen(optarg) > LOV_MAXPOOLNAME) {
+                                fprintf(stderr,
+                                        "Pool name %s is too long"
+                                        " (max is %d)\n", optarg,
+                                        LOV_MAXPOOLNAME);
+                                return -1;
+                        }
+                        /* we do check for empty pool because empty pool
+                         * is used to find V1 lov attributes */
+                        strncpy(param.poolname, optarg, LOV_MAXPOOLNAME);
+                        param.poolname[LOV_MAXPOOLNAME] = '\0';
+                        param.exclude_pool = !!neg_opt;
+                        param.check_pool = 1;
+                        break;
                  case 'n':
                          new_fashion = 1;
                          param.pattern = (char *)optarg;
@@ -542,7 +638,7 @@ static int lfs_find(int argc, char **argv)
                          strcpy(buf, (char *)optarg);
  
                          if (param.num_alloc_obds == 0) {
-                                param.obduuid = (struct obd_uuid *)malloc(FIND_MAX_OSTS *
+                                param.obduuid = malloc(FIND_MAX_OSTS *
                                                         sizeof(struct obd_uuid));
                                  if (param.obduuid == NULL)
                                          return -ENOMEM;
@@ -735,7 +831,7 @@ static int lfs_getstripe(int argc, char **argv)
          } while (++optind < argc && !rc);
  
          if (rc)
-                fprintf(stderr, "error: %s failed for %s.\n", 
+                fprintf(stderr, "error: %s failed for %s.\n",
                          argv[0], argv[optind - 1]);
          return rc;
  }
@@ -1102,7 +1198,8 @@ static int lfs_check(int argc, char **argv)
                  return -1;
          }
  
-        rc = llapi_target_check(num_types, obd_types, mnt->mnt_dir);
+        rc = llapi_target_iterate(num_types, obd_types,
+                                  mnt->mnt_dir, llapi_ping_target);
  
          if (rc)
                  fprintf(stderr, "error: %s: %s status failed\n",
@@ -1184,7 +1281,7 @@ out:
          return rc;
  }
  
-#ifdef HAVE_QUOTA_SUPPORT
+#ifdef HAVE_SYS_QUOTA_H
  static int lfs_quotachown(int argc, char **argv)
  {
  
@@ -1211,7 +1308,6 @@ static int lfs_quotachown(int argc, char **argv)
          return rc;
  }
  
-
  static int lfs_quotacheck(int argc, char **argv)
  {
          int c, check_type = 0;
@@ -1252,6 +1348,8 @@ static int lfs_quotacheck(int argc, char **argv)
          memset(&qctl, 0, sizeof(qctl));
          qctl.qc_cmd = LUSTRE_Q_QUOTAOFF;
          qctl.qc_type = check_type;
+        qctl.qc_id = QFMT_LDISKFS; /* compatibility: 1.6.5 and earliers
+                                    * take this parameter into account */
          rc = llapi_quotactl(mnt, &qctl);
          if (rc) {
                  fprintf(stderr, "quota off failed: %s\n", strerror(errno));
@@ -1276,6 +1374,8 @@ static int lfs_quotacheck(int argc, char **argv)
          memset(&qctl, 0, sizeof(qctl));
          qctl.qc_cmd = LUSTRE_Q_QUOTAON;
          qctl.qc_type = check_type;
+        qctl.qc_id = QFMT_LDISKFS; /* compatibility: 1.6.5 and earliers
+                                    * take this parameter into account */
          rc = llapi_quotactl(mnt, &qctl);
          if (rc) {
                  if (*obd_type)
@@ -1299,6 +1399,8 @@ static int lfs_quotaon(int argc, char **argv)
  
          memset(&qctl, 0, sizeof(qctl));
          qctl.qc_cmd = LUSTRE_Q_QUOTAON;
+        qctl.qc_id = QFMT_LDISKFS; /* compatibility: 1.6.5 and earliers
+                                    * take this parameter into account */
  
          optind = 0;
          while ((c = getopt(argc, argv, "ugf")) != -1) {
@@ -1380,7 +1482,7 @@ static int lfs_quotaoff(int argc, char **argv)
  
          rc = llapi_quotactl(mnt, &qctl);
          if (rc == -1 && errno == ESRCH) {
-                fprintf(stderr, "\n%s quotas are not enabled.\n", 
+                fprintf(stderr, "\n%s quotas are not enabled.\n",
                          qctl.qc_type == 0x00 ? "user" : "group");
                  return 0;
          }
@@ -1484,7 +1586,7 @@ static unsigned long str2sec(const char* timestr) {
  
                  v = strtoul(timestr, &tail, 10);
                  if (v == ULONG_MAX || *tail == '\0')
-                        /* value too large (ULONG_MAX or more) 
+                        /* value too large (ULONG_MAX or more)
                             or missing specifier */
                          goto error;
  
@@ -1567,11 +1669,11 @@ int lfs_setquota_times(int argc, char **argv)
                  qctl.qc_type = !strcmp(argv[2], "-u") ? USRQUOTA : GRPQUOTA;
  
                  if ((dqi->dqi_bgrace = str2sec(argv[3])) == ULONG_MAX) {
-                        fprintf(stderr, "error: bad block-grace: %s\n", optarg);
+                        fprintf(stderr, "error: bad block-grace: %s\n", argv[3]);
                          return CMD_HELP;
                  }
                  if ((dqi->dqi_igrace = str2sec(argv[4])) == ULONG_MAX) {
-                        fprintf(stderr, "error: bad inode-grace: %s\n", optarg);
+                        fprintf(stderr, "error: bad inode-grace: %s\n", argv[4]);
                          return CMD_HELP;
                  }
                  dqb->dqb_valid = QIF_TIMES;
@@ -1683,9 +1785,10 @@ int lfs_setquota(int argc, char **argv)
                                  " be available in future releases!\n");
  
                  qctl.qc_type = !strcmp(argv[1], "-u") ? USRQUOTA : GRPQUOTA;
-                rc = name2id(&qctl.qc_id, argv[2], qctl.qc_type);
+                rc = name2id(&qctl.qc_id, argv[2],
+                             (qctl.qc_type == USRQUOTA) ? USER : GROUP);
                  if (rc) {
-                        fprintf(stderr, "error: unknown id %s\n", optarg);
+                        fprintf(stderr, "error: unknown id %s\n", argv[2]);
                          return CMD_HELP;
                  }
  
@@ -1713,7 +1816,8 @@ int lfs_setquota(int argc, char **argv)
                                  return CMD_HELP;
                          }
                          qctl.qc_type = (c == 'u') ? USRQUOTA : GRPQUOTA;
-                        rc = name2id(&qctl.qc_id, optarg, qctl.qc_type);
+                        rc = name2id(&qctl.qc_id, optarg,
+                                     (qctl.qc_type == USRQUOTA) ? USER : GROUP);
                          if (rc) {
                                  fprintf(stderr, "error: unknown id %s\n",
                                          optarg);
@@ -1872,10 +1976,10 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int type)
                  struct obd_dqblk *dqb = &qctl->qc_dqblk;
  
                  if (dqb->dqb_bhardlimit &&
-                    toqb(dqb->dqb_curspace) > dqb->dqb_bhardlimit) {
+                    toqb(dqb->dqb_curspace) >= dqb->dqb_bhardlimit) {
                          bover = 1;
                  } else if (dqb->dqb_bsoftlimit &&
-                           toqb(dqb->dqb_curspace) > dqb->dqb_bsoftlimit) {
+                           toqb(dqb->dqb_curspace) >= dqb->dqb_bsoftlimit) {
                          if (dqb->dqb_btime > now) {
                                  bover = 2;
                          } else {
@@ -1884,10 +1988,10 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int type)
                  }
  
                  if (dqb->dqb_ihardlimit &&
-                    dqb->dqb_curinodes > dqb->dqb_ihardlimit) {
+                    dqb->dqb_curinodes >= dqb->dqb_ihardlimit) {
                          iover = 1;
                  } else if (dqb->dqb_isoftlimit &&
-                           dqb->dqb_curinodes > dqb->dqb_isoftlimit) {
+                           dqb->dqb_curinodes >= dqb->dqb_isoftlimit) {
                          if (dqb->dqb_btime > now) {
                                  iover = 2;
                          } else {
@@ -2033,21 +2137,30 @@ out:
  static int lfs_quota(int argc, char **argv)
  {
          int c;
-        char *name = NULL, *mnt;
+        char *mnt, *name = NULL;
          struct if_quotactl qctl = { .qc_cmd = LUSTRE_Q_GETQUOTA,
-                                    .qc_type = 0x01 };
+                                    .qc_type = UGQUOTA };
          char *obd_type = (char *)qctl.obd_type;
          char *obd_uuid = (char *)qctl.obd_uuid.uuid;
-        int rc, rc1 = 0, rc2 = 0, rc3 = 0;
+        int rc, rc1 = 0, rc2 = 0, rc3 = 0, verbose = 0;
+        int pass = 0;
  
          optind = 0;
-        while ((c = getopt(argc, argv, "ugto:")) != -1) {
+        while ((c = getopt(argc, argv, "ugto:v")) != -1) {
                  switch (c) {
                  case 'u':
-                        qctl.qc_type = 0x01;
+                        if (qctl.qc_type != UGQUOTA) {
+                                fprintf(stderr, "error: use either -u or -g\n");
+                                return CMD_HELP;
+                        }
+                        qctl.qc_type = USRQUOTA;
                          break;
                  case 'g':
-                        qctl.qc_type = 0x02;
+                        if (qctl.qc_type != UGQUOTA) {
+                                fprintf(stderr, "error: use either -u or -g\n");
+                                return CMD_HELP;
+                        }
+                        qctl.qc_type = GRPQUOTA;
                          break;
                  case 't':
                          qctl.qc_cmd = LUSTRE_Q_GETINFO;
@@ -2055,6 +2168,9 @@ static int lfs_quota(int argc, char **argv)
                  case 'o':
                          strncpy(obd_uuid, optarg, sizeof(qctl.obd_uuid));
                          break;
+                case 'v':
+                        verbose = 1;
+                        break;
                  default:
                          fprintf(stderr, "error: %s: option '-%c' "
                                          "unrecognized\n", argv[0], c);
@@ -2062,51 +2178,62 @@ static int lfs_quota(int argc, char **argv)
                  }
          }
  
-        if (qctl.qc_type)
-                qctl.qc_type--;
-
-
-        if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) {
+        /* current uid/gid info for "lfs quota /path/to/lustre/mount" */
+        if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA && qctl.qc_type == UGQUOTA &&
+            optind == argc - 1) {
+ug_output:
+                memset(&qctl, 0, sizeof(qctl)); /* spoiled by print_*_quota */
+                qctl.qc_cmd = LUSTRE_Q_GETQUOTA;
+                if (pass++ == 0) {
+                        qctl.qc_type = USRQUOTA;
+                        qctl.qc_id = geteuid();
+                } else {
+                        qctl.qc_type = GRPQUOTA;
+                        qctl.qc_id = getegid();
+                }
+                rc = id2name(&name, qctl.qc_id,
+                             (qctl.qc_type == USRQUOTA) ? USER : GROUP);
+                if (rc)
+                        name = "<unknown>";
+        } else if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) {
                  if (optind + 2 != argc) {
                          fprintf(stderr, "error: missing quota argument(s)\n");
                          return CMD_HELP;
                  }
  
                  name = argv[optind++];
-                rc = name2id(&qctl.qc_id, name, qctl.qc_type);
+                rc = name2id(&qctl.qc_id, name,
+                             (qctl.qc_type == USRQUOTA) ? USER : GROUP);
                  if (rc) {
                          fprintf(stderr,"error: can't find id for name %s: %s\n",
                                  name, strerror(errno));
                          return CMD_HELP;
                  }
-                print_quota_title(name, &qctl);
          } else if (optind + 1 != argc) {
                  fprintf(stderr, "error: missing quota info argument(s)\n");
                  return CMD_HELP;
          }
  
+        if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA)
+                print_quota_title(name, &qctl);
+
          mnt = argv[optind];
  
          rc1 = llapi_quotactl(mnt, &qctl);
          if (rc1 == -1 && errno == ESRCH) {
-                fprintf(stderr, "\n%s quotas are not enabled.\n", 
-                        qctl.qc_type == 0x00 ? "user" : "group");
-                return 0;
+                fprintf(stderr, "\n%s quotas are not enabled.\n",
+                        qctl.qc_type == USRQUOTA ? "user" : "group");
+                goto out;
          }
          if (rc1 && *obd_type)
                  fprintf(stderr, "%s %s ", obd_type, obd_uuid);
  
-        if (!name)
-                rc = id2name(&name, getuid(), qctl.qc_type);
-
-        if (*obd_uuid) {
+        if (*obd_uuid)
                  mnt = "";
-                name = obd_uuid;
-        }
  
          print_quota(mnt, &qctl, GENERAL_QUOTA_INFO);
  
-        if (!*obd_uuid && qctl.qc_cmd != LUSTRE_Q_GETINFO) {
+        if (!*obd_uuid && qctl.qc_cmd != LUSTRE_Q_GETINFO && verbose) {
                  rc2 = print_mds_quota(mnt, &qctl);
                  rc3 = print_lov_quota(mnt, &qctl);
          }
@@ -2115,9 +2242,14 @@ static int lfs_quota(int argc, char **argv)
                  printf("Some errors happened when getting quota info. "
                         "Some devices may be not working or deactivated. "
                         "The data in \"[]\" is inaccurate.\n");
+
+out:
+        if (pass == 1)
+                goto ug_output;
+
          return 0;
  }
-#endif /* HAVE_QUOTA_SUPPORT */
+#endif /* HAVE_SYS_QUOTA_H! */
  
  int main(int argc, char **argv)
  {
diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c

index 0043904..764b59a 100644 (file)
--- a/lustre/utils/liblustreapi.c
+++ b/lustre/utils/liblustreapi.c
@@ -1,30 +1,49 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Robert Read <rread@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/liblustreapi.c
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Robert Read <rread@clusterfs.com>
   */
  
  /* for O_DIRECTORY */
+#ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+#endif
  
  #include <stdlib.h>
  #include <stdio.h>
@@ -39,7 +58,9 @@
  #include <sys/stat.h>
  #include <sys/types.h>
  #include <sys/syscall.h>
+#include <sys/param.h>
  #include <fnmatch.h>
+#include <glob.h>
  #ifdef HAVE_ASM_TYPES_H
  #include <asm/types.h>
  #endif
@@ -198,71 +219,103 @@ int parse_size(char *optarg, unsigned long long *size,
          return 0;
  }
  
-int llapi_file_open(const char *name, int flags, int mode,
-                    unsigned long stripe_size, int stripe_offset,
-                    int stripe_count, int stripe_pattern)
+int llapi_stripe_limit_check(unsigned long long stripe_size, int stripe_offset,
+                             int stripe_count, int stripe_pattern)
  {
-        struct lov_user_md lum = { 0 };
-        int fd, rc = 0;
-        int isdir = 0;
          int page_size;
  
-        fd = open(name, flags | O_LOV_DELAY_CREATE, mode);
-        if (fd < 0 && errno == EISDIR) {
-                fd = open(name, O_DIRECTORY | O_RDONLY);
-                isdir++;
-        }
-
-        if (fd < 0) {
-                rc = -errno;
-                llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name);
-                return rc;
-        }
-
          /* 64 KB is the largest common page size I'm aware of (on ia64), but
           * check the local page size just in case. */
          page_size = LOV_MIN_STRIPE_SIZE;
          if (getpagesize() > page_size) {
                  page_size = getpagesize();
-                llapi_err_noerrno(LLAPI_MSG_WARN, 
+                llapi_err_noerrno(LLAPI_MSG_WARN,
                                    "warning: your page size (%u) is "
-                                  "larger than expected (%u)", page_size, 
+                                  "larger than expected (%u)", page_size,
                                    LOV_MIN_STRIPE_SIZE);
          }
          if (stripe_size < 0 || (stripe_size & (LOV_MIN_STRIPE_SIZE - 1))) {
-                errno = rc = -EINVAL;
+                errno = -EINVAL;
                  llapi_err(LLAPI_MSG_ERROR, "error: bad stripe_size %lu, "
-                          "must be an even multiple of %d bytes", 
+                          "must be an even multiple of %d bytes",
                            stripe_size, page_size);
-                goto out;
+                return errno;
          }
          if (stripe_offset < -1 || stripe_offset > MAX_OBD_DEVICES) {
-                errno = rc = -EINVAL;
-                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe offset %d", 
+                errno = -EINVAL;
+                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe offset %d",
                            stripe_offset);
-                goto out;
+                return errno;
          }
          if (stripe_count < -1 || stripe_count > LOV_MAX_STRIPE_COUNT) {
-                errno = rc = -EINVAL;
-                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe count %d", 
+                errno = -EINVAL;
+                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe count %d",
                            stripe_count);
-                goto out;
+                return errno;
          }
-        if (stripe_count > 0 && (__u64)stripe_size * stripe_count > 0xffffffff){
-                errno = rc = -EINVAL;
-                llapi_err(LLAPI_MSG_ERROR, "error: stripe_size %lu * "
-                          "stripe_count %u exceeds 4GB", stripe_size, 
-                          stripe_count);
+        if (stripe_size >= (1ULL << 32)) {
+                errno = -EINVAL;
+                llapi_err(LLAPI_MSG_ERROR, "warning: stripe size larger than 4G"
+                          " is not currently supported and would wrap");
+                return errno;
+        }
+        return 0;
+}
+
+static int poolpath(char *fsname, char *pathname, char *pool_pathname);
+
+int llapi_file_open_pool(const char *name, int flags, int mode,
+                         unsigned long long stripe_size, int stripe_offset,
+                         int stripe_count, int stripe_pattern, char *pool_name)
+{
+        struct lov_user_md_v3 lum = { 0 };
+        int fd, rc = 0;
+        int isdir = 0;
+        char fsname[MAX_OBD_NAME + 1], *ptr;
+
+        fd = open(name, flags | O_LOV_DELAY_CREATE, mode);
+        if (fd < 0 && errno == EISDIR) {
+                fd = open(name, O_DIRECTORY | O_RDONLY);
+                isdir++;
+        }
+
+        if (fd < 0) {
+                rc = -errno;
+                llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name);
+                return rc;
+        }
+
+        if ((rc = llapi_stripe_limit_check(stripe_size, stripe_offset,
+                                           stripe_count, stripe_pattern)) != 0){
+                errno = rc;
                  goto out;
          }
  
          /*  Initialize IOCTL striping pattern structure */
-        lum.lmm_magic = LOV_USER_MAGIC;
+        lum.lmm_magic = LOV_USER_MAGIC_V3;
          lum.lmm_pattern = stripe_pattern;
          lum.lmm_stripe_size = stripe_size;
          lum.lmm_stripe_count = stripe_count;
          lum.lmm_stripe_offset = stripe_offset;
  
+        /* in case user give the full pool name <fsname>.<poolname>, skip
+         * the fsname */
+        if (pool_name != NULL) {
+                ptr = strchr(pool_name, '.');
+                if (ptr != NULL) {
+                        strncpy(fsname, pool_name, ptr - pool_name);
+                        fsname[ptr - pool_name] = '\0';
+                        /* if fsname matches a filesystem skip it
+                         * if not keep the poolname as is */
+                        if (poolpath(fsname, NULL, NULL) == 0)
+                                pool_name = ptr + 1;
+                }
+                strncpy(lum.lmm_pool_name, pool_name, LOV_MAXPOOLNAME);
+        } else {
+                /* If no pool is specified at all, use V1 request */
+                lum.lmm_magic = LOV_USER_MAGIC_V1;
+        }
+
          if (ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum)) {
                  char *errmsg = "stripe already set";
                  rc = -errno;
@@ -271,7 +324,7 @@ int llapi_file_open(const char *name, int flags, int mode,
  
                  llapi_err_noerrno(LLAPI_MSG_ERROR,
                                    "error on ioctl "LPX64" for '%s' (%d): %s",
-                                  (__u64)LL_IOC_LOV_SETSTRIPE, name, fd, errmsg);
+                                  (__u64)LL_IOC_LOV_SETSTRIPE, name, fd,errmsg);
          }
  out:
          if (rc) {
@@ -282,13 +335,39 @@ out:
          return fd;
  }
  
-int llapi_file_create(const char *name, unsigned long stripe_size,
+int llapi_file_open(const char *name, int flags, int mode,
+                    unsigned long long stripe_size, int stripe_offset,
+                    int stripe_count, int stripe_pattern)
+{
+        return llapi_file_open_pool(name, flags, mode, stripe_size,
+                                    stripe_offset, stripe_count,
+                                    stripe_pattern, NULL);
+}
+
+int llapi_file_create(const char *name, unsigned long long stripe_size,
                        int stripe_offset, int stripe_count, int stripe_pattern)
  {
          int fd;
  
-        fd = llapi_file_open(name, O_CREAT | O_WRONLY, 0644, stripe_size,
-                             stripe_offset, stripe_count, stripe_pattern);
+        fd = llapi_file_open_pool(name, O_CREAT | O_WRONLY, 0644, stripe_size,
+                                  stripe_offset, stripe_count, stripe_pattern,
+                                  NULL);
+        if (fd < 0)
+                return fd;
+
+        close(fd);
+        return 0;
+}
+
+int llapi_file_create_pool(const char *name, unsigned long long stripe_size,
+                           int stripe_offset, int stripe_count,
+                           int stripe_pattern, char *pool_name)
+{
+        int fd;
+
+        fd = llapi_file_open_pool(name, O_CREAT | O_WRONLY, 0644, stripe_size,
+                                  stripe_offset, stripe_count, stripe_pattern,
+                                  pool_name);
          if (fd < 0)
                  return fd;
  
@@ -296,6 +375,187 @@ int llapi_file_create(const char *name, unsigned long stripe_size,
          return 0;
  }
  
+
+static int print_pool_members(char *fs, char *pool_dir, char *pool_file)
+{
+        char path[MAXPATHLEN + 1];
+        char buf[1024];
+        FILE *fd;
+
+        llapi_printf(LLAPI_MSG_NORMAL, "Pool: %s.%s\n", fs, pool_file);
+        sprintf(path, "%s/%s", pool_dir, pool_file);
+        if ((fd = fopen(path, "r")) == NULL) {
+                llapi_err(LLAPI_MSG_ERROR, "Cannot open %s\n", path);
+                return -EINVAL;
+        }
+        while (fgets(buf, sizeof(buf), fd) != NULL)
+               llapi_printf(LLAPI_MSG_NORMAL, buf);
+
+        fclose(fd);
+        return 0;
+}
+
+/*
+ * search lustre fsname from pathname
+ *
+ */
+static int search_fsname(char *pathname, char *fsname)
+{
+        char *ptr;
+        FILE *fp;
+        struct mntent *mnt = NULL;
+
+        /* get the mount point */
+        fp = setmntent(MOUNTED, "r");
+        if (fp == NULL) {
+                 llapi_err(LLAPI_MSG_ERROR,
+                           "setmntent(%s) failed: %s:", MOUNTED,
+                           strerror (errno));
+                 return -EIO;
+        }
+        mnt = getmntent(fp);
+        while ((feof(fp) == 0) && ferror(fp) == 0) {
+                if (llapi_is_lustre_mnt(mnt)) {
+                        /* search by pathname */
+                        if (strncmp(mnt->mnt_dir, pathname,
+                                    strlen(mnt->mnt_dir)) == 0) {
+                                ptr = strchr(mnt->mnt_fsname, '/');
+                                if (ptr == NULL)
+                                        return -EINVAL;
+                                ptr++;
+                                strcpy(fsname, ptr);
+                                return 0;
+                        }
+                }
+                mnt = getmntent(fp);
+        }
+        endmntent(fp);
+        return -ENOENT;
+
+}
+
+/*
+ * find the pool directory path under /proc
+ * (can be also used to test if a fsname is known)
+ */
+static int poolpath(char *fsname, char *pathname, char *pool_pathname)
+{
+        int rc = 0;
+        glob_t glob_info;
+        char pattern[MAXPATHLEN + 1];
+        char buffer[MAXPATHLEN];
+
+        if (fsname == NULL) {
+                rc = search_fsname(pathname, buffer);
+                if (rc != 0)
+                        return rc;
+                fsname = buffer;
+                strcpy(pathname, fsname);
+        }
+
+        snprintf(pattern, MAXPATHLEN,
+                 "/proc/fs/lustre/lov/%s-*/pools",
+                 fsname);
+        rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
+        if (rc)
+                return -ENOENT;
+
+        if (glob_info.gl_pathc == 0) {
+                globfree(&glob_info);
+                return -ENOENT;
+        }
+
+        /* in fsname test mode, pool_pathname is NULL */
+        if (pool_pathname != NULL)
+                strcpy(pool_pathname, glob_info.gl_pathv[0]);
+
+        return 0;
+}
+
+int llapi_poollist(char *name)
+{
+        char *poolname;
+        char *fsname;
+        char rname[MAXPATHLEN + 1], pathname[MAXPATHLEN + 1];
+        char *ptr;
+        int rc = 0;
+
+        /* is name a pathname ? */
+        ptr = strchr(name, '/');
+        if (ptr != NULL) {
+                /* only absolute pathname is supported */
+                if (*name != '/')
+                        return -EINVAL;
+                if (!realpath(name, rname)) {
+                        rc = -errno;
+                        llapi_err(LLAPI_MSG_ERROR,
+                                  "llapi_poollist: invalid path '%s'",
+                                  name);
+                        return rc;
+                }
+
+                rc = poolpath(NULL, rname, pathname);
+                if (rc != 0) {
+                        errno = -rc;
+                        llapi_err(LLAPI_MSG_ERROR,
+                                  "llapi_poollist: '%s' is not"
+                                  " a Lustre filesystem",
+                                  name);
+                        return rc;
+                }
+                fsname = rname;
+                poolname = NULL;
+        } else {
+                /* name is FSNAME[.POOLNAME] */
+                fsname = name;
+                poolname = strchr(name, '.');
+                if (poolname != NULL) {
+                        *poolname = '\0';
+                        poolname++;
+                }
+                rc = poolpath(fsname, NULL, pathname);
+                if (rc != 0) {
+                        errno = -rc;
+                        llapi_err(LLAPI_MSG_ERROR,
+                                  "llapi_poollist: Lustre filesystem '%s'"
+                                  " not found", name);
+                        return rc;
+                }
+        }
+        if (rc != 0) {
+                errno = -rc;
+                llapi_err(LLAPI_MSG_ERROR,
+                          "llapi_poollist: Lustre filesystem '%s' not found",
+                          name);
+                return rc;
+        }
+
+        if (poolname != NULL) {
+                rc = print_pool_members(fsname, pathname, poolname);
+                poolname--;
+                *poolname = '.';
+        } else {
+                DIR *dir;
+                struct dirent *pool;
+
+                llapi_printf(LLAPI_MSG_NORMAL, "Pools from %s:\n", fsname);
+                if ((dir = opendir(pathname)) == NULL) {
+                        return -EINVAL;
+                }
+                while ((pool = readdir(dir)) != NULL) {
+                        if (!((pool->d_name[0] == '.') &&
+                              (pool->d_name[1] == '\0')) &&
+                            !((pool->d_name[0] == '.') &&
+                              (pool->d_name[1] == '.') &&
+                              (pool->d_name[2] == '\0')))
+                        llapi_printf(LLAPI_MSG_NORMAL, " %s.%s\n",
+                                     fsname, pool->d_name);
+                }
+                closedir(dir);
+        }
+        return rc;
+}
+
  typedef int (semantic_func_t)(char *path, DIR *parent, DIR *d,
                                void *data, struct dirent64 *de);
  
@@ -304,9 +564,9 @@ typedef int (semantic_func_t)(char *path, DIR *parent, DIR *d,
  
  static int common_param_init(struct find_param *param)
  {
-        param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT);
+        param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC_V3);
          if ((param->lmd = malloc(sizeof(lstat_t) + param->lumlen)) == NULL) {
-                llapi_err(LLAPI_MSG_ERROR, 
+                llapi_err(LLAPI_MSG_ERROR,
                            "error: allocation of %d bytes for ioctl",
                            sizeof(lstat_t) + param->lumlen);
                  return -ENOMEM;
@@ -327,7 +587,7 @@ static void find_param_fini(struct find_param *param)
                  free(param->lmd);
  }
  
-int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_name)
+int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_name)
  {
          int rc = ioctl(fd, OBD_IOC_GETNAME, lov_name);
          if (rc) {
@@ -344,11 +604,11 @@ int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid)
          fd = open(path, O_RDONLY);
          if (fd < 0) {
                  rc = errno;
-                llapi_err(LLAPI_MSG_ERROR, "error opening %s\n", path);
+                llapi_err(LLAPI_MSG_ERROR, "error opening %s", path);
                  return rc;
          }
  
-        rc = llapi_file_get_lov_fuuid(fd, lov_uuid);
+        rc = llapi_file_fget_lov_uuid(fd, lov_uuid);
  
          close(fd);
  
@@ -369,7 +629,7 @@ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count)
          int rc = 0, index = 0;
  
          /* Get the lov name */
-        rc = llapi_file_get_lov_fuuid(fd, &lov_name);
+        rc = llapi_file_fget_lov_uuid(fd, &lov_name);
          if (rc)
                  return rc;
  
@@ -411,11 +671,11 @@ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param)
          int rc = 0, index;
  
          /* Get the lov name */
-        rc = llapi_file_get_lov_fuuid(dirfd(dir), &lov_uuid);
+        rc = llapi_file_fget_lov_uuid(dirfd(dir), &lov_uuid);
          if (rc) {
                  if (errno != ENOTTY) {
                          rc = errno;
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                    "error: can't get lov name: %s", dname);
                  } else {
                          rc = 0;
@@ -528,14 +788,89 @@ retry_get_uuids:
          return 0;
  }
  
-void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *path, int is_dir,
-                          int obdindex, int quiet, int header, int body)
+static void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path,
+                                   int is_dir, int obdindex, int quiet,
+                                   int header, int body)
+{
+        struct lov_user_md_join *lumj = (struct lov_user_md_join *)lum;
+        int i, obdstripe = 0;
+
+        if (obdindex != OBD_NOT_FOUND) {
+                for (i = 0; i < lumj->lmm_stripe_count; i++) {
+                        if (obdindex == lumj->lmm_objects[i].l_ost_idx) {
+                                llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
+                                obdstripe = 1;
+                                break;
+                        }
+                }
+        } else if (!quiet) {
+                llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
+                obdstripe = 1;
+        }
+
+        if (header && obdstripe == 1) {
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic:          0x%08X\n",
+                             lumj->lmm_magic);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr:      "LPX64"\n",
+                             lumj->lmm_object_gr);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n",
+                             lumj->lmm_object_id);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count:   %u\n",
+                             (int)lumj->lmm_stripe_count);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size:    %u\n",
+                             lumj->lmm_stripe_size);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_pattern: %x\n",
+                             lumj->lmm_pattern);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_extent_count:   %x\n",
+                             lumj->lmm_extent_count);
+        }
+
+        if (body) {
+                unsigned long long start = -1, end = 0;
+                if (!quiet && obdstripe == 1)
+                        llapi_printf(LLAPI_MSG_NORMAL,
+                                     "joined\tobdidx\t\t objid\t\tobjid\t\t "
+                                     "group\t\tstart\t\tend\n");
+                for (i = 0; i < lumj->lmm_stripe_count; i++) {
+                        int idx = lumj->lmm_objects[i].l_ost_idx;
+                        long long oid = lumj->lmm_objects[i].l_object_id;
+                        long long gr = lumj->lmm_objects[i].l_object_gr;
+                        if (obdindex == OBD_NOT_FOUND || obdindex == idx)
+                                llapi_printf(LLAPI_MSG_NORMAL,
+                                             "\t%6u\t%14llu\t%#13llx\t%14llu%s",
+                                             idx, oid, oid, gr,
+                                             obdindex == idx ? " *" : "");
+                        if (start != lumj->lmm_objects[i].l_extent_start ||
+                            end != lumj->lmm_objects[i].l_extent_end) {
+                                start = lumj->lmm_objects[i].l_extent_start;
+                                llapi_printf(LLAPI_MSG_NORMAL, "\t%14llu",
+                                             start);
+                                end = lumj->lmm_objects[i].l_extent_end;
+                                if (end == (unsigned long long)-1)
+                                        llapi_printf(LLAPI_MSG_NORMAL,
+                                                     "\t\tEOF\n");
+                                else
+                                        llapi_printf(LLAPI_MSG_NORMAL,
+                                                     "\t\t%llu\n", end);
+                        } else {
+                                llapi_printf(LLAPI_MSG_NORMAL, "\t\t\t\t\n");
+                        }
+                }
+                llapi_printf(LLAPI_MSG_NORMAL, "\n");
+        }
+}
+
+static void lov_dump_user_lmm_v1v3(struct lov_user_md *lum, char *pool_name,
+                                   struct lov_user_ost_data_v1 *objects,
+                                   char *path,
+                                   int is_dir, int obdindex, int quiet,
+                                   int header, int body)
  {
          int i, obdstripe = 0;
  
          if (obdindex != OBD_NOT_FOUND) {
                  for (i = 0; !is_dir && i < lum->lmm_stripe_count; i++) {
-                        if (obdindex == lum->lmm_objects[i].l_ost_idx) {
+                        if (obdindex == objects[i].l_ost_idx) {
                                  llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
                                  obdstripe = 1;
                                  break;
@@ -553,14 +888,16 @@ void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *path, int is_dir,
                                  llapi_printf(LLAPI_MSG_NORMAL, "(Default) ");
                                  lum->lmm_object_gr = LOV_OBJECT_GROUP_CLEAR;
                          }
-                        llapi_printf(LLAPI_MSG_NORMAL, 
+                        llapi_printf(LLAPI_MSG_NORMAL,
                                       "stripe_count: %d stripe_size: %u "
-                                     "stripe_offset: %d\n",
+                                     "stripe_offset: %d%s%s\n",
                                       lum->lmm_stripe_count == (__u16)-1 ? -1 :
-                                     lum->lmm_stripe_count,
+                                                        lum->lmm_stripe_count,
                                       lum->lmm_stripe_size,
                                       lum->lmm_stripe_offset == (__u16)-1 ? -1 :
-                                     lum->lmm_stripe_offset);
+                                                        lum->lmm_stripe_offset,
+                                     pool_name != NULL ? " pool: " : "",
+                                     pool_name != NULL ? pool_name : "");
                  }
                  return;
          }
@@ -573,111 +910,47 @@ void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *path, int is_dir,
                  llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n",
                               lum->lmm_object_id);
                  llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count:   %u\n",
-                             (int)lum->lmm_stripe_count);
+                             lum->lmm_stripe_count);
                  llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size:    %u\n",
                               lum->lmm_stripe_size);
                  llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_pattern: %x\n",
                               lum->lmm_pattern);
+                if (pool_name != NULL)
+                        llapi_printf(LLAPI_MSG_NORMAL,
+                                     "lmm_pool_name:      %s\n", pool_name);
          }
  
          if (body) {
                  if ((!quiet) && (obdstripe == 1))
-                        llapi_printf(LLAPI_MSG_NORMAL, 
+                        llapi_printf(LLAPI_MSG_NORMAL,
                                       "\tobdidx\t\t objid\t\tobjid\t\t group\n");
  
                  for (i = 0; i < lum->lmm_stripe_count; i++) {
-                        int idx = lum->lmm_objects[i].l_ost_idx;
-                        long long oid = lum->lmm_objects[i].l_object_id;
-                        long long gr = lum->lmm_objects[i].l_object_gr;
+                        int idx = objects[i].l_ost_idx;
+                        long long oid = objects[i].l_object_id;
+                        long long gr = objects[i].l_object_gr;
                          if ((obdindex == OBD_NOT_FOUND) || (obdindex == idx))
-                                llapi_printf(LLAPI_MSG_NORMAL, 
-                                             "\t%6u\t%14llu\t%#13llx\t%14llu%s\n",
-                                             idx, oid, oid, gr,
-                                             obdindex == idx ? " *" : "");
+                                llapi_printf(LLAPI_MSG_NORMAL,
+                                           "\t%6u\t%14llu\t%#13llx\t%14llu%s\n",
+                                           idx, oid, oid, gr,
+                                           obdindex == idx ? " *" : "");
                  }
                  llapi_printf(LLAPI_MSG_NORMAL, "\n");
          }
  }
  
-void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path,
-                            int is_dir, int obdindex, int quiet,
-                            int header, int body)
-{
-        struct lov_user_md_join *lumj = (struct lov_user_md_join *)lum;
-        int i, obdstripe = 0;
-
-        if (obdindex != OBD_NOT_FOUND) {
-                for (i = 0; i < lumj->lmm_stripe_count; i++) {
-                        if (obdindex == lumj->lmm_objects[i].l_ost_idx) {
-                                llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
-                                obdstripe = 1;
-                                break;
-                        }
-                }
-        } else if (!quiet) {
-                llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
-                obdstripe = 1;
-        }
-
-        if (header && obdstripe == 1) {
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic:          0x%08X\n",  
-                             lumj->lmm_magic);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr:      "LPX64"\n", 
-                             lumj->lmm_object_gr);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n", 
-                             lumj->lmm_object_id);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count:   %u\n", 
-                             (int)lumj->lmm_stripe_count);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size:    %u\n",
-                             lumj->lmm_stripe_size);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_pattern: %x\n",
-                             lumj->lmm_pattern);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_extent_count:   %x\n",
-                             lumj->lmm_extent_count);
-        }
-
-        if (body) {
-                unsigned long long start = -1, end = 0;
-                if (!quiet && obdstripe == 1)
-                        llapi_printf(LLAPI_MSG_NORMAL, 
-                                     "joined\tobdidx\t\t objid\t\tobjid\t\t group"
-                                     "\t\tstart\t\tend\n");
-                for (i = 0; i < lumj->lmm_stripe_count; i++) {
-                        int idx = lumj->lmm_objects[i].l_ost_idx;
-                        long long oid = lumj->lmm_objects[i].l_object_id;
-                        long long gr = lumj->lmm_objects[i].l_object_gr;
-                        if (obdindex == OBD_NOT_FOUND || obdindex == idx)
-                                llapi_printf(LLAPI_MSG_NORMAL, 
-                                             "\t%6u\t%14llu\t%#13llx\t%14llu%s",
-                                             idx, oid, oid, gr,
-                                             obdindex == idx ? " *" : "");
-                        if (start != lumj->lmm_objects[i].l_extent_start ||
-                            end != lumj->lmm_objects[i].l_extent_end) {
-                                start = lumj->lmm_objects[i].l_extent_start;
-                                llapi_printf(LLAPI_MSG_NORMAL, "\t%14llu", start);
-                                end = lumj->lmm_objects[i].l_extent_end;
-                                if (end == (unsigned long long)-1)
-                                        llapi_printf(LLAPI_MSG_NORMAL, "\t\tEOF\n");
-                                else
-                                        llapi_printf(LLAPI_MSG_NORMAL, "\t\t%llu\n",
-                                                  end);
-                        } else {
-                                llapi_printf(LLAPI_MSG_NORMAL, "\t\t\t\t\n");
-                        }
-                }
-                llapi_printf(LLAPI_MSG_NORMAL, "\n");
-        }
-}
  
  void llapi_lov_dump_user_lmm(struct find_param *param,
                               char *path, int is_dir)
  {
          switch(*(__u32 *)&param->lmd->lmd_lmm) { /* lum->lmm_magic */
          case LOV_USER_MAGIC_V1:
-                lov_dump_user_lmm_v1(&param->lmd->lmd_lmm, path, is_dir,
-                                      param->obdindex, param->quiet,
-                                      param->verbose,
-                                      (param->verbose || !param->obduuid));
+                lov_dump_user_lmm_v1v3(&param->lmd->lmd_lmm, NULL,
+                                       param->lmd->lmd_lmm.lmm_objects,
+                                       path, is_dir,
+                                       param->obdindex, param->quiet,
+                                       param->verbose,
+                                       (param->verbose || !param->obduuid));
                  break;
          case LOV_USER_MAGIC_JOIN:
                  lov_dump_user_lmm_join(&param->lmd->lmd_lmm, path, is_dir,
@@ -685,10 +958,27 @@ void llapi_lov_dump_user_lmm(struct find_param *param,
                                         param->verbose,
                                         (param->verbose || !param->obduuid));
                  break;
+        case LOV_USER_MAGIC_V3: {
+                char pool_name[LOV_MAXPOOLNAME + 1];
+                struct lov_user_ost_data_v1 *objects;
+                struct lov_user_md_v3 *lmmv3 = (void *)&param->lmd->lmd_lmm;
+
+                strncpy(pool_name, lmmv3->lmm_pool_name, LOV_MAXPOOLNAME);
+                pool_name[LOV_MAXPOOLNAME] = '\0';
+                objects = lmmv3->lmm_objects;
+                lov_dump_user_lmm_v1v3(&param->lmd->lmd_lmm, pool_name,
+                                       objects, path, is_dir,
+                                       param->obdindex, param->quiet,
+                                       param->verbose,
+                                       (param->verbose || !param->obduuid));
+                break;
+        }
          default:
-                llapi_printf(LLAPI_MSG_NORMAL, 
-                             "unknown lmm_magic:  %#x (expecting %#x)\n",
-                       *(__u32 *)&param->lmd->lmd_lmm, LOV_USER_MAGIC_V1);
+                llapi_printf(LLAPI_MSG_NORMAL, "unknown lmm_magic:  %#x "
+                             "(expecting one of %#x %#x %#x)\n",
+                             param->lmd->lmd_lmm.lmm_magic,
+                             LOV_USER_MAGIC_V1, LOV_USER_MAGIC_JOIN,
+                             LOV_USER_MAGIC_V3);
                  return;
          }
  }
@@ -773,7 +1063,8 @@ int llapi_mds_getfileinfo(char *path, DIR *parent,
  
          fname = (fname == NULL ? path : fname + 1);
          /* retrieve needed file info */
-        strncpy((char *)lmd, fname, lov_mds_md_size(MAX_LOV_UUID_COUNT));
+        strncpy((char *)lmd, fname, lov_mds_md_size(MAX_LOV_UUID_COUNT,
+                LOV_MAGIC));
          ret = ioctl(dirfd(parent), IOC_MDC_GETFILEINFO, (void *)lmd);
  
          if (ret) {
@@ -782,20 +1073,20 @@ int llapi_mds_getfileinfo(char *path, DIR *parent,
                           * Do the regular lstat(2) instead. */
                          ret = lstat_f(path, st);
                          if (ret) {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                            "error: %s: lstat failed for %s",
                                            __FUNCTION__, path);
                                  return ret;
                          }
                  } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_WARN, 
-                                  "warning: %s: %s does not exist", 
+                        llapi_err(LLAPI_MSG_WARN,
+                                  "warning: %s: %s does not exist",
                                    __FUNCTION__, path);
                          return -ENOENT;
                  } else {
-                        llapi_err(LLAPI_MSG_ERROR, 
-                                  "error: %s: IOC_MDC_GETFILEINFO failed for %s",
-                                  __FUNCTION__, path);
+                        llapi_err(LLAPI_MSG_ERROR,
+                                 "error: %s: IOC_MDC_GETFILEINFO failed for %s",
+                                 __FUNCTION__, path);
                          return ret;
                  }
          }
@@ -873,8 +1164,9 @@ static int llapi_semantic_traverse(char *path, int size, DIR *parent,
                                               ((struct find_param *)data)->lmd);
                          if (ret == 0) {
                                  ((struct find_param *)data)->have_fileinfo = 1;
-                                dent->d_type = llapi_filetype_dir_table[st->st_mode &
-                                                                        S_IFMT];
+                                dent->d_type =
+                                        llapi_filetype_dir_table[st->st_mode &
+                                                                 S_IFMT];
                          }
                          if (ret == -ENOENT)
                                  continue;
@@ -882,7 +1174,7 @@ static int llapi_semantic_traverse(char *path, int size, DIR *parent,
  
                  switch (dent->d_type) {
                  case DT_UNKNOWN:
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                    "error: %s: '%s' is UNKNOWN type %d",
                                    __FUNCTION__, dent->d_name, dent->d_type);
                          break;
@@ -1013,6 +1305,7 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
                          void *data, struct dirent64 *de)
  {
          struct find_param *param = (struct find_param *)data;
+        struct lov_user_md_v3 *lmmv3 = (void *)&param->lmd->lmd_lmm;
          int decision = 1; /* 1 is accepted; -1 is rejected. */
          lstat_t *st = &param->lmd->lmd_st;
          int lustre_fs = 1;
@@ -1077,18 +1370,18 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
                          lustre_fs = 0;
                          ret = lstat_f(path, st);
                          if (ret) {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                            "error: %s: lstat failed for %s",
                                            __FUNCTION__, path);
                                  return ret;
                          }
                  } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_WARN, 
+                        llapi_err(LLAPI_MSG_WARN,
                                    "warning: %s: %s does not exist",
                                    __FUNCTION__, path);
                          goto decided;
                  } else {
-                        llapi_err(LLAPI_MSG_ERROR, "error: %s: %s failed for %s",
+                        llapi_err(LLAPI_MSG_ERROR,"error: %s: %s failed for %s",
                                    __FUNCTION__, dir ? "LL_IOC_MDC_GETINFO" :
                                    "IOC_MDC_GETFILEINFO", path);
                          return ret;
@@ -1115,6 +1408,22 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
                  }
          }
  
+        if (param->check_pool) {
+                /* empty requested pool is taken as no pool search => V1 */
+                if ((param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V1 &&
+                     param->poolname[0] == '\0') ||
+                    (param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V3 &&
+                     (strncmp(lmmv3->lmm_pool_name, param->poolname,
+                              LOV_MAXPOOLNAME) == 0 ||
+                      strcmp(param->poolname, "*") == 0))) {
+                        if (param->exclude_pool)
+                                goto decided;
+                } else {
+                        if (!param->exclude_pool)
+                                goto decided;
+                }
+        }
+
          /* Check the time on mds. */
          if (!decision) {
                  int for_mds;
@@ -1175,11 +1484,18 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
                          goto decided;
                  } else {
                          int i, j;
+                        struct lov_user_ost_data_v1 *lmm_objects;
+
+                        if (param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V3)
+                                lmm_objects = lmmv3->lmm_objects;
+                        else
+                                lmm_objects = param->lmd->lmd_lmm.lmm_objects;
+
                          for (i = 0;
                               i < param->lmd->lmd_lmm.lmm_stripe_count; i++) {
                                  for (j = 0; j < param->num_obds; j++) {
                                          if (param->obdindexes[j] ==
-                                            param->lmd->lmd_lmm.lmm_objects[i].l_ost_idx)
+                                            lmm_objects[i].l_ost_idx)
                                                  goto obd_matches;
                                  }
                          }
@@ -1206,12 +1522,12 @@ obd_matches:
  
                  if (ret) {
                          if (errno == ENOENT) {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                            "warning: %s: %s does not exist",
                                            __FUNCTION__, path);
                                  goto decided;
                          } else {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                            "%s: IOC_LOV_GETINFO on %s failed",
                                            __FUNCTION__, path);
                                  return ret;
@@ -1316,20 +1632,20 @@ static int cb_getstripe(char *path, DIR *parent, DIR *d, void *data,
          if (ret) {
                  if (errno == ENODATA) {
                          if (!param->obduuid && !param->quiet)
-                                llapi_printf(LLAPI_MSG_NORMAL, 
+                                llapi_printf(LLAPI_MSG_NORMAL,
                                               "%s has no stripe info\n", path);
                          goto out;
                  } else if (errno == ENOTTY) {
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                    "%s: '%s' not on a Lustre fs?",
                                    __FUNCTION__, path);
                  } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_WARN, 
+                        llapi_err(LLAPI_MSG_WARN,
                                    "warning: %s: %s does not exist",
                                    __FUNCTION__, path);
                          goto out;
                  } else {
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                    "error: %s: %s failed for %s",
                                     __FUNCTION__, d ? "LL_IOC_LOV_GETSTRIPE" :
                                    "IOC_MDC_GETFILESTRIPE", path);
@@ -1354,7 +1670,7 @@ int llapi_getstripe(char *path, struct find_param *param)
          int ret = 0, len = strlen(path);
  
          if (len > PATH_MAX) {
-                llapi_err(LLAPI_MSG_ERROR, 
+                llapi_err(LLAPI_MSG_ERROR,
                            "%s: Path name '%s' is too long",
                            __FUNCTION__, path);
                  return -EINVAL;
@@ -1400,7 +1716,7 @@ int llapi_obd_statfs(char *path, __u32 type, __u32 index,
          data.ioc_plen2 = sizeof(struct obd_uuid);
  
          if ((rc = obd_ioctl_pack(&data, &rawbuf, sizeof(raw))) != 0) {
-                llapi_err(LLAPI_MSG_ERROR, 
+                llapi_err(LLAPI_MSG_ERROR,
                            "llapi_obd_statfs: error packing ioctl data");
                  return rc;
          }
@@ -1411,7 +1727,7 @@ int llapi_obd_statfs(char *path, __u32 type, __u32 index,
  
          if (fd < 0) {
                  rc = errno ? -errno : -EBADF;
-                llapi_err(LLAPI_MSG_ERROR, "error: %s: opening '%s'", 
+                llapi_err(LLAPI_MSG_ERROR, "error: %s: opening '%s'",
                            __FUNCTION__, path);
                  return rc;
          }
@@ -1450,7 +1766,7 @@ int llapi_ping(char *obd_type, char *obd_name)
          return rc;
  }
  
-int llapi_target_iterate(int type_num, char **obd_type, void *args, llapi_cb_t cb)
+int llapi_target_iterate(int type_num, char **obd_type,void *args,llapi_cb_t cb)
  {
          char buf[MAX_STRING_SIZE];
          FILE *fp = fopen(DEVICES_LIST, "r");
@@ -1658,11 +1974,11 @@ static int cb_quotachown(char *path, DIR *parent, DIR *d, void *data,
          if (rc) {
                  if (errno == ENODATA) {
                          if (!param->obduuid && !param->quiet)
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                            "%s has no stripe info", path);
                          rc = 0;
                  } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                    "warning: %s: %s does not exist",
                                    __FUNCTION__, path);
                          rc = 0;
@@ -1686,7 +2002,7 @@ static int cb_quotachown(char *path, DIR *parent, DIR *d, void *data,
  
          rc = chmod(path, st->st_mode);
          if (rc)
-                llapi_err(LLAPI_MSG_ERROR, "error: chmod %s (%hu)", 
+                llapi_err(LLAPI_MSG_ERROR, "error: chmod %s (%hu)",
                            path, st->st_mode);
  
          return rc;
diff --git a/lustre/utils/ll_recover_lost_found_objs.c b/lustre/utils/ll_recover_lost_found_objs.c

index 71b5bd2..449cfae 100644 (file)
--- a/lustre/utils/ll_recover_lost_found_objs.c
+++ b/lustre/utils/ll_recover_lost_found_objs.c
@@ -1,32 +1,51 @@
-/*
- *   Copyright (C) 2008 Sun Microssystems, Inc.
- *   Author: Rupesh Thakare <rupesh.thakare@sun.com>
- *   Author: Kalpak Shah <kalpak.shah@sun.com>
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
-
  /*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/ll_recover_lost_found_objs.c
+ *
   * Tool for recovering objects from lost+found that might result from a
   * Lustre OST with a corrupted directory. Running e2fsck will fix the
   * directory, but puts all of the objects into lost+found, where they are
   * inaccessible to Lustre.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
   */
  
+#ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+#endif
  
  #include <stdio.h>
  #include <stdlib.h>
@@ -304,7 +323,7 @@ int check_last_id(char *mount_path)
  
         for (group = 0; group < MAX_GROUPS; group++) {
                 max_objid = 0;
-               sprintf(dirname, "%s/O/"LPU64, mount_path, group);
+               sprintf(dirname, "%s/O/%llu", mount_path, group);
  
                 strcpy(lastid_path, dirname);
                 strcat(lastid_path, "/LAST_ID");
diff --git a/lustre/utils/llanalyze b/lustre/utils/llanalyze

index c464ff0..0ca2d18 100644 (file)
--- a/lustre/utils/llanalyze
+++ b/lustre/utils/llanalyze
@@ -433,12 +433,12 @@ Displays all the debug messages from the occurence of the specified string till
  
  =head1 REPORTING BUGS
  
-Please report all bugs to Cluster FileSystems, support@clusterfs.com
+Please report all bugs to Sun Microsystems, Inc. http://bugzilla.lustre.org/
  
  
  =head1 AVAILABILITY
  
-llanalyze is part of the Lustre(7) filesystem package and is available from CFS http://clusterfs.com
+llanalyze is part of the Lustre(7) filesystem package and is available from http://www.sun.com/software/products/lustre/index.xml
  
  =head1 SEE ALSO
  
diff --git a/lustre/utils/llog_reader.c b/lustre/utils/llog_reader.c

index 117bf64..524daa8 100644 (file)
--- a/lustre/utils/llog_reader.c
+++ b/lustre/utils/llog_reader.c
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
   /* Interpret configuration llogs */
  
@@ -341,6 +355,8 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
          case(LCFG_MARKER):{
                  struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
                  char createtime[26], canceltime[26] = "";
+                time_t time_tmp;
+
                  if (marker->cm_flags & CM_SKIP) {
                          if (marker->cm_flags & CM_START) {
                                  printf("SKIP START ");
@@ -350,18 +366,39 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
                                  *skip = 0;
                          }
                  }
+
                  if (marker->cm_flags & CM_EXCLUDE) {
                          if (marker->cm_flags & CM_START) 
                                  printf("EXCLUDE START ");
                          else
                                  printf("EXCLUDE END   ");
                  }
-                ctime_r(&marker->cm_createtime, createtime);
-                createtime[strlen(createtime) - 1] = 0;
+
+                /* Handle overflow of 32-bit time_t gracefully.
+                 * The copy to time_tmp is needed in any case to
+                 * keep the pointer happy, even on 64-bit systems. */
+                time_tmp = marker->cm_createtime;
+                if (time_tmp == marker->cm_createtime) {
+                        ctime_r(&time_tmp, createtime);
+                        createtime[strlen(createtime) - 1] = 0;
+                } else {
+                        strcpy(createtime, "in the distant future");
+                }
+
                  if (marker->cm_canceltime) {
-                        ctime_r(&marker->cm_canceltime, canceltime);
-                        canceltime[strlen(canceltime) - 1] = 0;
+                        /* Like cm_createtime, we try to handle overflow of
+                         * 32-bit time_t gracefully. The copy to time_tmp
+                         * is also needed on 64-bit systems to keep the
+                         * pointer happy, see bug 16771 */
+                        time_tmp = marker->cm_canceltime;
+                        if (time_tmp == marker->cm_canceltime) {
+                                ctime_r(&time_tmp, canceltime);
+                                canceltime[strlen(canceltime) - 1] = 0;
+                        } else {
+                                strcpy(canceltime, "in the distant future");
+                        }
                  }
+
                  printf("marker %3d (flags=%#04x, v%d.%d.%d.%d) %-15s '%s' %s-%s",
                         marker->cm_step, marker->cm_flags,
                         OBD_OCD_VERSION_MAJOR(marker->cm_vers),
@@ -372,6 +409,26 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
                         createtime, canceltime);
                  break;
          }
+        case(LCFG_POOL_NEW):{
+                printf("pool new ");
+                print_1_cfg(lcfg);
+                break;
+        }
+        case(LCFG_POOL_ADD):{
+                printf("pool add ");
+                print_1_cfg(lcfg);
+                break;
+        }
+        case(LCFG_POOL_REM):{
+                printf("pool remove ");
+                print_1_cfg(lcfg);
+                break;
+        }
+        case(LCFG_POOL_DEL):{
+                printf("pool destroy ");
+                print_1_cfg(lcfg);
+                break;
+        }
          default:
                  printf("unsupported cmd_code = %x\n",cmd);
          }
diff --git a/lustre/utils/llverdev.c b/lustre/utils/llverdev.c

index 91a21c2..94db011 100644 (file)
--- a/lustre/utils/llverdev.c
+++ b/lustre/utils/llverdev.c
@@ -1,4 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
  /*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/llverdev.c
+ *
   * Large Block Device Verification Tool.
   * This program is used to test whether the block device is correctly
   * handling IO beyond 2TB boundary.
diff --git a/lustre/utils/llverfs.c b/lustre/utils/llverfs.c

index e1e9e3c..613d1b8 100644 (file)
--- a/lustre/utils/llverfs.c
+++ b/lustre/utils/llverfs.c
@@ -1,4 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
  /*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/llverfs.c
+ *
   * ext3 Filesystem Verification Tool.
   * This program tests the correct operation of ext3 filesystem.
   * This tool have two working modes
diff --git a/lustre/utils/loadgen.c b/lustre/utils/loadgen.c

index 4ba3dcd..35ba76b 100644 (file)
--- a/lustre/utils/loadgen.c
+++ b/lustre/utils/loadgen.c
@@ -1,30 +1,46 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/loadgen.c
   *
- * loadgen.c
   * See how many local OSCs we can start whaling on a OST
   * We're doing direct ioctls instead of going though a system() call to lctl
   * to avoid the bash overhead.
   * Adds an osc / echo client pair in each thread and starts echo transactions.
   *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  #include <pthread.h>
@@ -1034,4 +1050,3 @@ int main (int argc, char **argv)
          return rc;
  }
  #endif
-
diff --git a/lustre/utils/lr_reader.c b/lustre/utils/lr_reader.c

index f1275b3..996f4d9 100644 (file)
--- a/lustre/utils/lr_reader.c
+++ b/lustre/utils/lr_reader.c
@@ -1,28 +1,47 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/lr_reader.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
   /* Safely read the last_rcvd file from a device */
  
+#ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+#endif
  #include <stdlib.h>
  #include <stdio.h>
  #include <unistd.h>
@@ -204,6 +223,3 @@ out_rmdir:
          run_command(cmd);
          return ret;
  }
-
-
-
diff --git a/lustre/utils/lshowmount.c b/lustre/utils/lshowmount.c

new file mode 100644 (file)

index 0000000..4b2675f
--- /dev/null
+++ b/lustre/utils/lshowmount.c
@@ -0,0 +1,411 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <dirent.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <errno.h>
+#include "lshowmount.h"
+#include "hash.h"
+#include "hostlist.h"
+
+#define PROGNAME "lshowmount"
+
+extern int errno;
+static int enumerate = 0;
+static int lookup    = 0;
+static int verbose   = 0;
+
+static int totalexports = 0;
+static int totalfailures = 0;
+
+static struct option long_options[] = {
+    {"enumerate", 0, 0, 'e'},
+    {"help",      0, 0, 'h'},
+    {"lookup",    0, 0, 'l'},
+    {"verbose",   0, 0, 'v'},
+    {0, 0, 0, 0}
+};
+
+inline int
+lshowmount_hash_strcmp(const void *key1, const void *key2)
+{
+    return strcmp((char *) key1, (char *) key2);
+}
+
+inline void
+lshowmount_hash_hostlist_freeitem(void *data)
+{
+    hostlist_t hl = NULL;
+
+    if (data == NULL) {
+        return;
+    }
+
+    hl = (hostlist_t) data;
+    hostlist_destroy(hl);
+}
+
+inline int
+is_ipaddress(const char *str)
+{
+    int rc = 0;
+    int quad[4];
+
+    rc = sscanf(str, "%d.%d.%d.%d", &quad[0], &quad[1], &quad[2], &quad[3]);
+    if (rc == 4) {
+        return 1;
+    }
+    return 0;
+}
+
+inline void
+lshowmount_gethostname(const char *src, char *dst, int dstsize)
+{
+    struct hostent *hostptr = NULL;
+    char tmpsrc[4];
+    int rc = 0;
+
+    memset(dst, 0, sizeof(char) * dstsize);
+    if (lookup && is_ipaddress(src)) {
+        rc = inet_pton(AF_INET, src, tmpsrc);
+
+        if (rc <= 0) {
+            strncpy(dst, src, dstsize);
+            return;
+        }
+        else {
+            hostptr = gethostbyaddr(tmpsrc, 4, AF_INET);
+            if (hostptr == NULL) {
+                strncpy(dst, src, dstsize);
+                return;
+            }
+            else {
+                strncpy(dst, hostptr->h_name, dstsize);
+                return;
+            }
+        }
+    }
+    strncpy(dst, src, dstsize);
+}
+
+void
+lshowmount_print_hosts(char** network,
+                       hash_t network_hash)
+{
+    hostlist_t hl = NULL;
+    hostlist_iterator_t itr = NULL;
+    char *hosts = NULL;
+    int numnets = 0, numhosts = 0, i = 0;
+
+    if (network == NULL || network_hash == NULL) {
+        return;
+    }
+
+    numnets = hash_count(network_hash);
+    for (i = 0; i < numnets; i++) {
+        errno = 0;
+        hl = hash_remove(network_hash, network[i]);
+        if (hl == NULL) {
+            continue;
+        }
+        hostlist_uniq(hl);
+        numhosts = hostlist_count(hl);
+
+        if (numhosts > 0) {
+            if (enumerate) {
+                itr = hostlist_iterator_create(hl);
+
+                /* setup argument */
+                while ((hosts = hostlist_next(itr)) != NULL) {
+                    printf("    %s@%s\n", hosts, network[i]);
+                }
+                hostlist_iterator_destroy(itr);
+            }
+            else {
+                hosts = malloc(sizeof(char) * (numhosts) * (NID_MAX+1));
+                if (hosts == NULL) {
+                    fprintf(stderr, "warning: could not allocate buffer "
+                                    "to print hostrange\n");
+                    return;
+                }
+                hostlist_ranged_string(hl, sizeof(char) *
+                                           numhosts *
+                                           (NID_MAX+1), hosts);
+                printf("    %s@%s\n", hosts, network[i]);
+                free(hosts);
+                hosts = NULL;
+            }
+        }
+        memset(network[i], 0, sizeof(char) * (LNET_NETWORK_TYPE_MAX+1));
+        lshowmount_hash_hostlist_freeitem(hl);
+    }
+}
+
+void usage(void)
+{
+       fprintf(stderr, "usage: %s [-e] [-h] [-l] [-v]\n", PROGNAME);
+}
+
+int getclients(char*  procpath,
+               char** network,
+               hash_t network_hash)
+{
+    DIR *dirp, *dirp2;
+    struct dirent *dp, *dp2;
+    char path[PATH_MAX+1];
+    char nid[NID_MAX+1], addr[NID_MAX+1];
+    int size = PATH_MAX+1, sizeleft, sizeleft2;
+    int tmplen, tmplen2, idx, rc = 0;
+    char *tmp, *tmp2;
+    hostlist_t hl;
+
+    if (procpath == NULL) {
+        return -1;
+    }
+
+    /* It is not an error if we cannot open
+     * procpath since we are not sure if this
+     * node is an mgs, mds, and/or oss */
+    errno = 0;
+    dirp = opendir(procpath);
+    if (dirp == NULL) {
+        return 0;
+    }
+
+    do {
+        errno = 0;
+        dp = readdir(dirp);
+        if (dp != NULL) {
+            if (dp->d_type != DT_DIR ||
+                strncmp(dp->d_name, ".", 2) == 0  ||
+                strncmp(dp->d_name, "..", 3) == 0) {
+                continue;
+            }
+
+            sizeleft = size;
+            tmp = path;
+            memset(tmp, 0, sizeof(char) * sizeleft);
+
+            strncpy(tmp, procpath, sizeleft);
+            tmplen = strnlen(tmp, sizeleft);
+            sizeleft -= tmplen;
+            tmp += tmplen;
+
+            strncpy(tmp, "/", sizeleft);
+            tmplen = strnlen(tmp, sizeleft);
+            sizeleft -= tmplen;
+            tmp += tmplen;
+
+            strncpy(tmp, dp->d_name, sizeleft);
+            tmplen = strnlen(tmp, sizeleft);
+            sizeleft -= tmplen;
+            tmp += tmplen;
+
+            strncpy(tmp, "/", sizeleft);
+            tmplen = strnlen(tmp, sizeleft);
+            sizeleft -= tmplen;
+            tmp += tmplen;
+
+            strncpy(tmp, PROC_EXPORTS, sizeleft);
+            tmplen = strnlen(tmp, sizeleft);
+            sizeleft -= tmplen;
+            tmp += tmplen;
+
+            errno = 0;
+            dirp2 = opendir(path);
+            if (dirp2 == NULL) {
+                fprintf(stderr, "error: could not open: %s\n", path);
+                rc = errno;
+                continue;
+            }
+
+            do {
+                errno = 0;
+                dp2 = readdir(dirp2);
+                if (dp2 != NULL) {
+                    if (strncmp(dp2->d_name, ".", 2) == 0  ||
+                        strncmp(dp2->d_name, "..", 3) == 0 ||
+                        dp2->d_type != DT_DIR) {
+                        continue;
+                    }
+                    totalexports++;
+
+                    sizeleft2 = sizeleft;
+                    tmp2 = tmp;
+                    memset(tmp2, 0, sizeof(char) * sizeleft2);
+
+                    strncpy(tmp2, "/", sizeleft2);
+                    tmplen2 = strnlen(tmp2, sizeleft2);
+                    sizeleft2 -= tmplen2;
+                    tmp2 += tmplen2;
+
+                    strncpy(tmp2, dp2->d_name, sizeleft2);
+                    tmplen2 = strnlen(tmp2, sizeleft2);
+                    sizeleft2 -= tmplen2;
+                    tmp2 += tmplen2;
+
+                    memset(nid, 0, sizeof(char) * (NID_MAX+1));
+                    strncpy(nid, basename(path), sizeof(char) * (NID_MAX+1));
+                    tmp2 = strrchr(nid, '@');
+                    if (tmp2 == NULL) {
+                        totalfailures++;
+                        continue;
+                    }
+                    *tmp2 = '\0';
+                    tmp2++;
+                    /* Note that tmp2 should now hold the lnet network */
+
+                    /* Check to see if this lnet network already has a hostset
+                     * associated with it */
+                    errno = 0;
+                    hl = hash_find(network_hash, tmp2);
+                    if (hl == NULL) {
+                        if (hash_count(network_hash) >= NETWORK_MAX) {
+                            (void)closedir(dirp2);
+                            return EINVAL;
+                        }
+
+                        /* Create a new hostset for this hash table and
+                         * insert the first part of the nid into it */
+                        idx = hash_count(network_hash);
+                        strncpy(network[idx], tmp2, LNET_NETWORK_TYPE_MAX);
+                        lshowmount_gethostname(nid, addr, NID_MAX+1);
+                        hl = hostlist_create(addr);
+                        hash_insert(network_hash, network[idx], hl);
+                    }
+                    else {
+                        lshowmount_gethostname(nid, addr, NID_MAX+1);
+                        hostlist_push_host(hl, addr);
+                    }
+                }
+            } while (dp2 != NULL);
+            (void) closedir(dirp2);
+
+            /* If the verbose option is set we want to print
+             * out the hostlist for each mgs, mds, obdfilter */
+            if (verbose) {
+                printf("%s:\n", dp->d_name);
+                if (totalfailures > 0) {
+                    fprintf(stderr, "failures %d of %d exports\n",
+                            totalfailures, totalexports);
+                }
+
+                if (!rc && totalfailures > 0) {
+                    rc = 1;
+                }
+
+                totalexports = totalfailures = 0;
+                lshowmount_print_hosts(network, network_hash);
+            }
+        }
+    } while (dp != NULL);
+    (void) closedir(dirp);
+
+    if (!rc && totalfailures > 0) {
+        rc = 1;
+    }
+
+    return rc;
+}
+
+int main(int argc, char **argv)
+{
+    int                 opt = 0;
+    int                 optidx = 0;
+    int                 i = 0, rc = 0, rc2 = 0, rc3 = 0;
+    hash_t              network_hash = NULL;
+    char**              network = NULL;
+
+    while ((opt = getopt_long(argc, argv, "ehlv",long_options, &optidx)) != -1) {
+        switch (opt) {
+            case 'e':
+                enumerate = 1;
+                break;
+            case 'h':
+                usage();
+                goto finish;
+                break;
+            case 'l':
+                lookup = 1;
+                break;
+            case 'v':
+                verbose = 1;
+                break;
+            default:
+                usage();
+                rc = -1;
+                goto finish;
+                break;
+        }
+    }
+
+    /* Allocate memory for NETWORK_MAX total possible
+     * lnet networks.  Each network will have its own
+     * hash table so that we can possibly create a ranged
+     * string for it */
+    network = malloc(sizeof(char *) * NETWORK_MAX);
+    if (network == NULL) {
+        rc = ENOMEM;
+        goto finish;
+    }
+    memset(network, 0, sizeof(char *) * NETWORK_MAX);
+    for (i = 0; i < NETWORK_MAX; i++) {
+        network[i] = malloc(sizeof(char) * (LNET_NETWORK_TYPE_MAX+1));
+        if (network[i] == NULL) {
+            rc = ENOMEM;
+            goto finish;
+        }
+        memset(network[i], 0, sizeof(char) * (LNET_NETWORK_TYPE_MAX+1));
+    }
+
+    /* Initialize the network_hash.  This hash table will map
+     * a particular network say elan1 or tcp2 to a hostset */
+    network_hash = hash_create(0,
+                               (hash_key_f) hash_key_string,
+                               lshowmount_hash_strcmp,
+                               lshowmount_hash_hostlist_freeitem);
+
+    rc  = getclients(PROC_DIR_MGS, network, network_hash);
+    rc2 = getclients(PROC_DIR_MDS, network, network_hash);
+    rc3 = getclients(PROC_DIR_OST, network, network_hash);
+    if (rc || rc2 || rc3) {
+        rc = rc2 > rc ? rc2 : rc;
+        rc = rc3 > rc ? rc3 : rc;
+    }
+
+    if (!verbose) {
+        if (totalfailures > 0) {
+            fprintf(stderr, "failures %d of %d exports\n",
+                    totalfailures, totalexports);
+        }
+        lshowmount_print_hosts(network, network_hash);
+    }
+
+finish:
+    hash_destroy(network_hash);
+    if (network != NULL) {
+        for (i = 0; i < NETWORK_MAX; i++) {
+            if (network[i] != NULL) {
+                free(network[i]);
+                network[i] = NULL;
+            }
+        }
+        free(network);
+        network = NULL;
+    }
+
+    return rc;
+}
+
+/*
+ * vi:tabstop=4 shiftwidth=4 expandtab
+ */
diff --git a/lustre/utils/lshowmount.h b/lustre/utils/lshowmount.h

new file mode 100644 (file)

index 0000000..06e1c5d
--- /dev/null
+++ b/lustre/utils/lshowmount.h
@@ -0,0 +1,17 @@
+#ifndef __LSHOWMOUNT_H
+#define __LSHOWMOUNT_H
+
+#define PROC_DIR_MGS          "/proc/fs/lustre/mgs"
+#define PROC_DIR_MDS          "/proc/fs/lustre/mds"
+#define PROC_DIR_OST          "/proc/fs/lustre/obdfilter"
+#define PROC_EXPORTS          "exports"
+#define PROC_NID              "nid"
+#define NID_MAX               1024
+#define LNET_NETWORK_TYPE_MAX 32
+#define NETWORK_MAX           128
+
+#endif
+
+/*
+ * vi:tabstop=4 shiftwidth=4 expandtab
+ */
diff --git a/lustre/utils/ltrack_stats.c b/lustre/utils/ltrack_stats.c

index ccd9e7e..b418051 100644 (file)
--- a/lustre/utils/ltrack_stats.c
+++ b/lustre/utils/ltrack_stats.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2007 Cluster File Systems, Inc.
- *   Author: Milind Dumbare <milind@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/ltrack_stats.c
+ *
+ * Author: Milind Dumbare <milind@clusterfs.com>
   */
  
  #include <sys/types.h>
diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c

index 9d382a3..36608d3 100644 (file)
--- a/lustre/utils/lustre_cfg.c
+++ b/lustre/utils/lustre_cfg.c
@@ -1,27 +1,44 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Robert Read <rread@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/lustre_cfg.c
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Robert Read <rread@clusterfs.com>
   */
  
  #include <stdlib.h>
@@ -728,7 +745,12 @@ int jt_lcfg_setparam(int argc, char **argv)
                  }
                  /* Write the new value to the file */
                  fp = open(glob_info.gl_pathv[i], O_WRONLY);
-                if (fp > 0) {
+                if (fp == -1) {
+                        fprintf(stderr, "error: %s: %s opening %s\n",
+                                jt_cmdname(argv[0]), strerror(rc = errno),
+                                glob_info.gl_pathv[i]);
+                        break;
+                } else {
                          rc = write(fp, value, strlen(value));
                          if (rc < 0)
                                  fprintf(stderr,
@@ -737,10 +759,6 @@ int jt_lcfg_setparam(int argc, char **argv)
                          else
                                  rc = 0;
                          close(fp);
-                } else {
-                        fprintf(stderr, "error: %s: %s opening %s\n",
-                                jt_cmdname(argv[0]), strerror(rc = errno),
-                                glob_info.gl_pathv[i]);
                  }
          }
  
diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c

index 70523c6..9c62901 100644 (file)
--- a/lustre/utils/mkfs_lustre.c
+++ b/lustre/utils/mkfs_lustre.c
@@ -1,28 +1,48 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
- /* This source file is compiled into both mkfs.lustre and tunefs.lustre */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/mkfs_lustre.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+*/
  
+/* This source file is compiled into both mkfs.lustre and tunefs.lustre */
+
+#ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+#endif
  #include <stdlib.h>
  #include <stdio.h>
  #include <unistd.h>
@@ -50,6 +70,7 @@
  #include <lustre_param.h>
  #include <lnet/lnetctl.h>
  #include <lustre_ver.h>
+#include "mount_utils.h"
  
  #ifndef PATH_MAX
  #define PATH_MAX 4096
@@ -73,8 +94,8 @@ struct mkfs_opts {
          int   mo_mgs_failnodes;
  };
  
-static char *progname;
-static int verbose = 1;
+char *progname;
+int verbose = 1;
  static int print_only = 0;
  static int failover = 0;
  
@@ -125,12 +146,6 @@ void usage(FILE *out)
  #define vprint if (verbose > 0) printf
  #define verrprint if (verbose >= 0) printf
  
-static void fatal(void)
-{
-        verbose = 0;
-        fprintf(stderr, "\n%s FATAL: ", progname);
-}
-
  /*================ utility functions =====================*/
  
  char *strscat(char *dst, char *src, int buflen) {
@@ -185,47 +200,6 @@ int get_os_version()
          return version;
  }
  
-int run_command(char *cmd, int cmdsz)
-{
-        char log[] = "/tmp/mkfs_logXXXXXX";
-        int fd = -1, rc;
-
-        if ((cmdsz - strlen(cmd)) < 6) {
-                fatal();
-                fprintf(stderr, "Command buffer overflow: %.*s...\n",
-                        cmdsz, cmd);
-                return ENOMEM;
-        }
-
-        if (verbose > 1) {
-                printf("cmd: %s\n", cmd);
-        } else {
-                if ((fd = mkstemp(log)) >= 0) {
-                        close(fd);
-                        strcat(cmd, " >");
-                        strcat(cmd, log);
-                }
-        }
-        strcat(cmd, " 2>&1");
-
-        /* Can't use popen because we need the rv of the command */
-        rc = system(cmd);
-        if (rc && (fd >= 0)) {
-                char buf[128];
-                FILE *fp;
-                fp = fopen(log, "r");
-                if (fp) {
-                        while (fgets(buf, sizeof(buf), fp) != NULL) {
-                                printf("   %s", buf);
-                        }
-                        fclose(fp);
-                }
-        }
-        if (fd >= 0)
-                remove(log);
-        return rc;
-}
-
  static int check_mtab_entry(char *spec)
  {
          FILE *fp;
@@ -401,9 +375,8 @@ static void disp_old_e2fsprogs_msg(const char *feature, int make_backfs)
  
          fprintf(stderr, "WARNING: The e2fsprogs package currently installed on "
                  "your system does not support \"%s\" feature.\nPlease install "
-                "the latest version of e2fsprogs from http://www.clusterfs.com/"
-                "downloads/public/Lustre/Tools/e2fsprogs/\nto enable this "
-                "feature.\n", feature);
+                "the latest version of e2fsprogs from http://downloads.lustre.org"
+                "/public/tools/e2fsprogs/\nto enable this feature.\n", feature);
  
          if (make_backfs)
                  fprintf(stderr, "Feature will not be enabled until e2fsprogs "
@@ -421,7 +394,7 @@ static int file_in_dev(char *file_name, char *dev_name)
  
          /* Construct debugfs command line. */
          snprintf(debugfs_cmd, sizeof(debugfs_cmd),
-                "debugfs -c -R 'stat %s' %s 2>&1 | egrep '(Inode|unsupported)'",
+                "debugfs -c -R 'stat %s' '%s' 2>&1 | egrep '(Inode|unsupported)'",
                  file_name, dev_name);
  
          fp = popen(debugfs_cmd, "r");
@@ -441,6 +414,7 @@ static int file_in_dev(char *file_name, char *dev_name)
                  if (strstr(debugfs_cmd, "unsupported feature")) {
                          disp_old_e2fsprogs_msg("an unknown", 0);
                  }
+                pclose(fp);
                  return -1;
          }
          pclose(fp);
@@ -891,8 +865,12 @@ int read_local_files(struct mkfs_opts *mop)
  
          dev = mop->mo_device;
  
+        /* TODO: it's worth observing the get_mountdata() function that is
+                 in mount_utils.c for getting the mountdata out of the
+                 filesystem */
+
          /* Construct debugfs command line. */
-        snprintf(cmd, cmdsz, "debugfs -c -R 'dump /%s %s/mountdata' %s",
+        snprintf(cmd, cmdsz, "debugfs -c -R 'dump /%s %s/mountdata' '%s'",
                   MOUNT_DATA_FILE, tmpdir, dev);
  
          ret = run_command(cmd, cmdsz);
@@ -1089,38 +1067,40 @@ static int add_param(char *buf, char *key, char *val, int unique)
  #define MAXNIDSTR 1024
  static char *convert_hostnames(char *s1)
  {
-        char *converted, *s2 = 0, *c;
+        char *converted, *s2 = 0, *c, *end, sep;
          int left = MAXNIDSTR;
          lnet_nid_t nid;
  
          converted = malloc(left);
+        end = s1 + strlen(s1);
          c = converted;
-        while ((left > 0) && ((s2 = strsep(&s1, ",: \0")))) {
-                nid = libcfs_str2nid(s2);
+        while ((left > 0) && (s1 < end)) {
+                s2 = strpbrk(s1, ",:");
+                if (!s2)
+                        s2 = end;
+                sep = *s2;
+                *s2 = '\0';
+                nid = libcfs_str2nid(s1);
+                
                  if (nid == LNET_NID_ANY) {
-                        if (*s2 == '/')
-                                /* end of nids */
-                                break;
-                        fprintf(stderr, "%s: Can't parse NID '%s'\n",
-                                progname, s2);
+                        fprintf(stderr, "%s: Can't parse NID '%s'\n", progname, s1);
                          free(converted);
                          return NULL;
                  }
-
                  if (strncmp(libcfs_nid2str(nid), "127.0.0.1",
                              strlen("127.0.0.1")) == 0) {
                          fprintf(stderr, "%s: The NID '%s' resolves to the "
                                  "loopback address '%s'.  Lustre requires a "
                                  "non-loopback address.\n",
-                                progname, s2, libcfs_nid2str(nid));
+                                progname, s1, libcfs_nid2str(nid));
                          free(converted);
                          return NULL;
                  }
-
-                c += snprintf(c, left, "%s,", libcfs_nid2str(nid));
+                                        
+                c += snprintf(c, left, "%s%c", libcfs_nid2str(nid), sep);
                  left = converted + MAXNIDSTR - c;
+                s1 = s2 + 1;
          }
-        *(c - 1) = '\0';
          return converted;
  }
  
@@ -1476,7 +1456,7 @@ int main(int argc, char *const argv[])
                          strscat(always_mountopts, ",asyncdel",
                                  sizeof(always_mountopts));
                  /* NB: Files created while extents are enabled cannot be read
-                   if mounted with a kernel that doesn't include the CFS
+                   if mounted with a kernel that doesn't include the Lustre ldiskfs
                     patches! */
                  if (IS_OST(ldd) &&
                      (ldd->ldd_mount_type == LDD_MT_LDISKFS ||
diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c

index 906a10a..25c12f4 100644 (file)
--- a/lustre/utils/mount_lustre.c
+++ b/lustre/utils/mount_lustre.c
@@ -1,29 +1,48 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Robert Read <rread@clusterfs.com>
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/mount_lustre.c
+ *
+ * Author: Robert Read <rread@clusterfs.com>
+ * Author: Nathan Rutman <nathan@clusterfs.com>
   */
  
  
+#ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+#endif
  #include <stdlib.h>
  #include <stdio.h>
  #include <unistd.h>
@@ -33,21 +52,23 @@
  #include <sys/mount.h>
  #include <mntent.h>
  #include <getopt.h>
-#include <sys/utsname.h>
  #include "obdctl.h"
  #include <lustre_ver.h>
  #include <glob.h>
  #include <ctype.h>
  #include <limits.h>
+#include "mount_utils.h"
  
  #define MAX_HW_SECTORS_KB_PATH  "queue/max_hw_sectors_kb"
  #define MAX_SECTORS_KB_PATH     "queue/max_sectors_kb"
+#define MAX_RETRIES 99
  
  int          verbose = 0;
  int          nomtab = 0;
  int          fake = 0;
  int          force = 0;
-static char *progname = NULL;
+int          retry = 0;
+char         *progname = NULL;
  
  void usage(FILE *out)
  {
@@ -72,6 +93,7 @@ void usage(FILE *out)
                  "\t\tnomgs: only start target obds, using existing MGS\n"
                  "\t\texclude=<ostname>[:<ostname>] : colon-separated list of "
                  "inactive OSTs (e.g. lustre-OST0001)\n"
+                "\t\tretry=<num>: number of times mount is retried by client\n"
                  );
          exit((out != stdout) ? EINVAL : 0);
  }
@@ -237,7 +259,7 @@ static int parse_one_option(const char *check, int *flagp)
     fill in mount flags */
  int parse_options(char *orig_options, int *flagp)
  {
-        char *options, *opt, *nextopt;
+        char *options, *opt, *nextopt, *arg, *val;
  
          options = calloc(strlen(orig_options) + 1, 1);
          *flagp = 0;
@@ -246,7 +268,19 @@ int parse_options(char *orig_options, int *flagp)
                  if (!*opt)
                          /* empty option */
                          continue;
-                if (parse_one_option(opt, flagp) == 0) {
+
+                /* Handle retries in a slightly different
+                 * manner */
+                arg = opt;
+                val = strchr(opt, '=');
+                if (val != NULL && strncmp(arg, "retry", 5) == 0) {
+                        retry = atoi(val + 1);
+                        if (retry > MAX_RETRIES)
+                                retry = MAX_RETRIES;
+                        else if (retry < 0)
+                                retry = 0;
+                }
+                else if (parse_one_option(opt, flagp) == 0) {
                          /* pass this on as an option */
                          if (*options)
                                  strcat(options, ",");
@@ -306,7 +340,7 @@ int set_tunables(char *source, int src_len)
          ret_path = realpath(source, real_path);
          if (ret_path == NULL) {
                  if (verbose)
-                        fprintf(stderr, "warning: %s: cannot resolve: %s",
+                        fprintf(stderr, "warning: %s: cannot resolve: %s\n",
                                  source, strerror(errno));
                  return -EINVAL;
          }
@@ -548,16 +582,39 @@ int main(int argc, char *const argv[])
                  printf("mounting device %s at %s, flags=%#x options=%s\n",
                         source, target, flags, optcopy);
  
-        if (set_tunables(source, strlen(source)) && verbose)
+        if (!strstr(usource, ":/") && set_tunables(source, strlen(source)) &&
+            verbose)
                  fprintf(stderr, "%s: unable to set tunables for %s"
-                                " (may cause reduced IO performance)",
+                                " (may cause reduced IO performance)\n",
                                  argv[0], source);
  
-        if (!fake)
+        register_service_tags(usource, source, target);
+
+        if (!fake) {
                  /* flags and target get to lustre_get_sb, but not
                     lustre_fill_super.  Lustre ignores the flags, but mount
                     does not. */
-                rc = mount(source, target, "lustre", flags, (void *)optcopy);
+                for (i = 0, rc = -EAGAIN; i <= retry && rc != 0; i++) {
+                        rc = mount(source, target, "lustre", flags,
+                                   (void *)optcopy);
+                        if (rc) {
+                                if (verbose) {
+                                        fprintf(stderr, "%s: mount %s at %s "
+                                                "failed: %s retries left: "
+                                                "%d\n", basename(progname),
+                                                usource, target,
+                                                strerror(errno), retry-i);
+                                }
+
+                                if (retry) {
+                                        sleep(1 << max((i/2), 5));
+                                }
+                                else {
+                                        rc = errno;
+                                }
+                        }
+                }
+        }
  
          if (rc) {
                  char *cli;
diff --git a/lustre/utils/mount_utils.c b/lustre/utils/mount_utils.c

new file mode 100644 (file)

index 0000000..d023b45
--- /dev/null
+++ b/lustre/utils/mount_utils.c
@@ -0,0 +1,265 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <config.h>
+#include <lustre_disk.h>
+#include <lustre_ver.h>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+
+extern char *progname;
+extern int verbose;
+
+#define vprint(fmt, arg...) if (verbose > 0) printf(fmt, ##arg)
+#define verrprint(fmt, arg...) if (verbose >= 0) fprintf(stderr, fmt, ##arg)
+
+void fatal(void)
+{
+        verbose = 0;
+        fprintf(stderr, "\n%s FATAL: ", progname);
+}
+
+int run_command(char *cmd, int cmdsz)
+{
+        char log[] = "/tmp/run_command_logXXXXXX";
+        int fd = -1, rc;
+
+        if ((cmdsz - strlen(cmd)) < 6) {
+                fatal();
+                fprintf(stderr, "Command buffer overflow: %.*s...\n",
+                        cmdsz, cmd);
+                return ENOMEM;
+        }
+
+        if (verbose > 1) {
+                printf("cmd: %s\n", cmd);
+        } else {
+                if ((fd = mkstemp(log)) >= 0) {
+                        close(fd);
+                        strcat(cmd, " >");
+                        strcat(cmd, log);
+                }
+        }
+        strcat(cmd, " 2>&1");
+
+        /* Can't use popen because we need the rv of the command */
+        rc = system(cmd);
+        if (rc && (fd >= 0)) {
+                char buf[128];
+                FILE *fp;
+                fp = fopen(log, "r");
+                if (fp) {
+                        while (fgets(buf, sizeof(buf), fp) != NULL) {
+                                printf("   %s", buf);
+                        }
+                        fclose(fp);
+                }
+        }
+        if (fd >= 0)
+                remove(log);
+        return rc;
+}
+
+int get_mountdata(char *dev, struct lustre_disk_data *mo_ldd)
+{
+
+        char tmpdir[] = "/tmp/lustre_tmp.XXXXXX";
+        char cmd[256];
+        char filepnm[128];
+        FILE *filep;
+        int ret = 0;
+        int ret2 = 0;
+        int cmdsz = sizeof(cmd);
+
+        /* Make a temporary directory to hold Lustre data files. */
+        if (!mkdtemp(tmpdir)) {
+                verrprint("%s: Can't create temporary directory %s: %s\n",
+                         progname, tmpdir, strerror(errno));
+                return errno;
+        }
+
+        snprintf(cmd, cmdsz, "/sbin/debugfs -c -R 'dump /%s %s/mountdata' %s",
+                 MOUNT_DATA_FILE, tmpdir, dev);
+
+        ret = run_command(cmd, cmdsz);
+        if (ret) {
+                verrprint("%s: Unable to dump %s dir (%d)\n",
+                          progname, MOUNT_CONFIGS_DIR, ret);
+                goto out_rmdir;
+        }
+
+        sprintf(filepnm, "%s/mountdata", tmpdir);
+        filep = fopen(filepnm, "r");
+        if (filep) {
+                vprint("Reading %s\n", MOUNT_DATA_FILE);
+                fread(mo_ldd, sizeof(*mo_ldd), 1, filep);
+       } else {
+                verrprint("%s: Unable to read %d.%d config %s.\n",
+                          progname, LUSTRE_MAJOR, LUSTRE_MINOR, filepnm);
+                goto out_close;
+       }
+
+out_close:
+        fclose(filep);
+
+out_rmdir:
+        snprintf(cmd, cmdsz, "rm -rf %s", tmpdir);
+        ret2 = run_command(cmd, cmdsz);
+        if (ret2) {
+                verrprint("Failed to remove temp dir %s (%d)\n", tmpdir, ret2);
+               /* failure return from run_command() is more important
+                 * than the failure to remove a dir */
+               if (!ret)
+                       ret = ret2;
+       }
+
+        return ret;
+}
+
+#define PARENT_URN "urn:uuid:2bb5bdbf-6c4b-11dc-9b8e-080020a9ed93"
+#define PARENT_PRODUCT "Lustre"
+
+static int stclient(char *type, char *arch)
+{
+
+        char product[64];
+        char *urn = NULL;
+        char cmd[1024];
+        FILE *fp;
+        int i;
+
+        if (strcmp(type, "Client") == 0)
+                urn = CLIENT_URN;
+        else if (strcmp(type, "MDS") == 0)
+                urn = MDS_URN;
+        else if (strcmp(type, "MGS") == 0)
+                urn = MGS_URN;
+        else if (strcmp(type, "OSS") == 0)
+                urn = OSS_URN;
+
+        snprintf(product, 64, "Lustre %s %d.%d.%d", type, LUSTRE_MAJOR,
+                 LUSTRE_MINOR, LUSTRE_PATCH); 
+
+        /* need to see if the entry exists first */
+        snprintf(cmd, 1024,
+                 "/opt/sun/servicetag/bin/stclient -f -t '%s' ", urn);
+        fp = popen(cmd, "r");
+        if (!fp) {
+                if (verbose)
+                        fprintf(stderr, "%s: trying to run stclient -f: %s\n",
+                                progname, strerror(errno));
+                return 0;
+        }
+
+        i = fread(cmd, 1, sizeof(cmd), fp);
+        if (i) {
+                cmd[i] = 0;
+                if (strcmp(cmd, "Record not found\n") != 0) {
+                        /* exists, just return */
+                        pclose(fp);
+                        return 0;
+                }
+        }
+        pclose(fp);
+
+        snprintf(cmd, 1024, "/opt/sun/servicetag/bin/stclient -a -p '%s' "
+               "-e %d.%d.%d -t '%s' -S mount -F '%s' -P '%s' -m SUN "
+               "-A %s -z global", product, LUSTRE_MAJOR, LUSTRE_MINOR,
+               LUSTRE_PATCH, urn, PARENT_URN, PARENT_PRODUCT, arch);
+
+        return(run_command(cmd, sizeof(cmd)));
+}
+
+void register_service_tags(char *usource, char *source, char *target)
+{
+        struct lustre_disk_data mo_ldd;
+        struct utsname utsname_buf;
+        struct stat stat_buf;
+        char stclient_loc[] = "/opt/sun/servicetag/bin/stclient";
+        int rc;
+
+        rc = stat(stclient_loc, &stat_buf);
+
+        if (rc == 0) {
+                /* call the service tags stclient to show that we use Lustre on
+                   this system */
+
+                rc = uname(&utsname_buf);
+                if (rc) {
+                        if (verbose)
+                                fprintf(stderr,
+                                        "%s: trying to get uname failed: %s, "
+                                        "inventory tags will not be created\n",
+                                        progname, strerror(errno));
+                } else {
+
+                        /* client or server? */
+                        if (strchr(usource, ':')) {
+                                stclient("Client", utsname_buf.machine);
+                        } else {
+                                /* first figure what type of device it is */
+                                rc = get_mountdata(source, &mo_ldd);
+                                if (rc) {
+                                        if (verbose)
+                                                fprintf(stderr,
+                                                        "%s: trying to read mountdata from %s "
+                                                        "failed: %s, inventory tags will not "
+                                                        "be created\n",
+                                                        progname, target, strerror(errno));
+                                } else {
+
+                                        if (IS_MDT(&mo_ldd))
+                                                stclient("MDS", utsname_buf.machine);
+
+                                        if (IS_MGS(&mo_ldd))
+                                                stclient("MGS", utsname_buf.machine);
+
+                                        if (IS_OST(&mo_ldd))
+                                                stclient("OSS", utsname_buf.machine);
+                                }
+                        }
+                }
+        } else {
+                if (errno != ENOENT && verbose) {
+                        fprintf(stderr,
+                                "%s: trying to stat stclient failed: %s\n",
+                                progname, strerror(errno));
+                }
+        }
+}
diff --git a/lustre/utils/mount_utils.h b/lustre/utils/mount_utils.h

new file mode 100644 (file)

index 0000000..a4a3898
--- /dev/null
+++ b/lustre/utils/mount_utils.h
@@ -0,0 +1,47 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MOUNT_UTILS_H_
+#define _MOUNT_UTILS_H_
+
+#include <lustre_disk.h>
+
+void fatal(void);
+int run_command(char *, int);
+int get_mountdata(char *, struct lustre_disk_data *);
+void register_service_tags(char *, char *, char *);
+
+#endif
diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c

index 28d4f70..35b7712 100644 (file)
--- a/lustre/utils/obd.c
+++ b/lustre/utils/obd.c
@@ -1,27 +1,44 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Robert Read <rread@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/obd.c
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Robert Read <rread@clusterfs.com>
   */
  
  #include <stdlib.h>
@@ -31,9 +48,11 @@
  #include <sys/types.h>
  #include <sys/wait.h>
  #include <sys/stat.h>
+#include <sys/param.h>
  #include <stdio.h>
  #include <stdarg.h>
  #include <signal.h>
+#include <glob.h>
  
  #include "obdctl.h"
  
@@ -47,6 +66,7 @@
  #include <errno.h>
  #include <string.h>
  #include <ctype.h>
+#include <lustre/liblustreapi.h>
  
  #ifdef HAVE_ASM_PAGE_H
  #include <asm/page.h>           /* needed for PAGE_SIZE - rread */
@@ -104,7 +124,7 @@ struct lsm_buffer {
          struct lov_oinfo *ptrs[MAX_STRIPES];
  } lsm_buffer;
  
-static int l2_ioctl(int dev_id, int opc, void *buf)
+static int l2_ioctl(int dev_id, unsigned int opc, void *buf)
  {
          return l_ioctl(dev_id, opc, buf);
  }
@@ -153,42 +173,51 @@ int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg)
  
  static int do_device(char *func, char *devname);
  
-int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg)
+static int get_mgs_device()
  {
-        struct obd_ioctl_data data;
-        static int mgs_device = -1;
          char mgs[] = "$MGS";
-        int rc;
+        static int mgs_device = -1;
  
-        /* Always operates on MGS dev */
          if (mgs_device == -1) {
+                int rc;
                  do_disconnect(NULL, 1);
                  rc = do_device("mgsioc", mgs);
                  if (rc) {
+                        fprintf(stderr, 
+                                "This command must be run on the MGS.\n");
                          errno = ENODEV;
                          return -1;
                  }
                  mgs_device = cur_device;
          }
+        return mgs_device;
+}
  
+/* Returns -1 on error with errno set */
+int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg)
+{
+        struct obd_ioctl_data data;
+        int rc;
+        
          IOC_INIT(data);
-        data.ioc_dev = mgs_device;
+        rc = data.ioc_dev = get_mgs_device();
+        if (rc < 0)
+                goto out;
          data.ioc_type = LUSTRE_CFG_TYPE;
          data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
                                          lcfg->lcfg_buflens);
          data.ioc_pbuf1 = (void *)lcfg;
          IOC_PACK(func, data);
  
-        rc =  l_ioctl(dev_id, OBD_IOC_PARAM, buf);
-
-        if (rc == ENODEV)
-                fprintf(stderr, "Is the MGS running on this node?\n");
-        if (rc == ENOSYS)
-                fprintf(stderr, "Make sure cfg_device is set first.\n");
-        if (rc == EINVAL)
-                fprintf(stderr, "cfg_device should be of the form "
-                        "'lustre-MDT0000'\n");
-
+        rc = l_ioctl(dev_id, OBD_IOC_PARAM, buf);
+out:
+        if (rc) {
+                if (errno == ENOSYS)
+                        fprintf(stderr, "Make sure cfg_device is set first.\n");
+                if (errno == EINVAL)
+                        fprintf(stderr, "cfg_device should be of the form "
+                                "'lustre-MDT0000'\n");
+        }
          return rc;
  }
  
@@ -2282,7 +2311,7 @@ int jt_blockdev_info(int argc, char **argv)
          if (ino == 0ULL)
                  fprintf(stdout, "Not attached\n");
          else
-                fprintf(stdout, "attached to inode %llu\n", ino);
+                fprintf(stdout, "attached to inode "LPU64"\n", ino);
  out:
          close(fd);
          return -rc;
@@ -2323,3 +2352,620 @@ void obd_finalize(int argc, char **argv)
          shmem_stop();
          do_disconnect(argv[0], 1);
  }
+
+static int find_target_obdpath(char *fsname, char *path)
+{
+        glob_t glob_info;
+        char pattern[MAXPATHLEN + 1];
+        int rc;
+
+        snprintf(pattern, MAXPATHLEN,
+                 "/proc/fs/lustre/lov/%s-*/target_obd",
+                 fsname);
+        rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
+        if (rc)
+                return -EINVAL;
+
+        if (glob_info.gl_pathc == 0) {
+                globfree(&glob_info);
+                return -EINVAL;
+        }
+
+        strcpy(path, glob_info.gl_pathv[0]);
+        return 0;
+}
+
+static int find_poolpath(char *fsname, char *poolname, char *poolpath)
+{
+        glob_t glob_info;
+        char pattern[MAXPATHLEN + 1];
+        int rc;
+
+        snprintf(pattern, MAXPATHLEN,
+                 "/proc/fs/lustre/lov/%s-*/pools/%s",
+                 fsname, poolname);
+        rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
+        if (rc)
+                return -EINVAL;
+
+        if (glob_info.gl_pathc == 0) {
+                globfree(&glob_info);
+                return -EINVAL;
+        }
+
+        strcpy(poolpath, glob_info.gl_pathv[0]);
+        return 0;
+}
+
+/*
+ * if pool is NULL, search ostname in target_obd
+ * if pool is no NULL
+ *  if pool not found returns < 0
+ *  if ostname is NULL, returns 1 if pool is not empty and 0 if pool empty
+ *  if ostname is not NULL, returns 1 if OST is in pool and 0 if not
+ */
+static int search_ost(char *fsname, char *poolname, char *ostname)
+{
+        FILE *fd;
+        char buffer[MAXPATHLEN + 1];
+        int len = 0, rc;
+
+        if (ostname != NULL)
+                len = strlen(ostname);
+
+        if (poolname == NULL)
+                rc = find_target_obdpath(fsname, buffer);
+        else
+                rc = find_poolpath(fsname, poolname, buffer);
+        if (rc)
+                return rc;
+
+        if ((fd = fopen(buffer, "r")) == NULL)
+                return -EINVAL;
+
+        while (fgets(buffer, sizeof(buffer), fd) != NULL) {
+                if (poolname == NULL) {
+                        /* we search ostname in target_obd */
+                        if (strncmp(buffer + 3, ostname, len) == 0) {
+                                fclose(fd);
+                                return 1;
+                        }
+                } else {
+                        /* we search a non empty pool or
+                           an ostname in a pool */
+                        if ((ostname == NULL) ||
+                            (strncmp(buffer, ostname, len) == 0)) {
+                                fclose(fd);
+                                return 1;
+                        }
+                }
+        }
+        fclose(fd);
+        return 0;
+}
+
+static int check_pool_cmd(enum lcfg_command_type cmd,
+                          char *fsname, char *poolname,
+                          char *ostname)
+{
+        int rc = 0;
+
+        switch (cmd) {
+        case LCFG_POOL_NEW: {
+                if (search_ost(fsname, poolname, NULL) >= 0) {
+                        fprintf(stderr, "Pool %s.%s already exists\n",
+                                fsname, poolname);
+                        return -EEXIST;
+                }
+                return 0;
+        }
+        case LCFG_POOL_DEL: {
+                rc = search_ost(fsname, poolname, NULL);
+                if (rc < 0) {
+                        fprintf(stderr, "Pool %s.%s not found\n",
+                                fsname, poolname);
+                        return -ENOENT;
+                }
+                if (rc == 1) {
+                        fprintf(stderr, "Pool %s.%s not empty, "
+                                "please remove all members\n",
+                                fsname, poolname);
+                        return -ENOTEMPTY;
+                }
+                return 0;
+        }
+        case LCFG_POOL_ADD: {
+                rc = search_ost(fsname, NULL, ostname);
+                if (rc == 0) {
+                        fprintf(stderr, "OST %s not found in lov of %s\n",
+                                ostname, fsname);
+                        return -ENOENT;
+                }
+                rc = search_ost(fsname, poolname, ostname);
+                if (rc < 0) {
+                        fprintf(stderr, "Pool %s.%s not found\n",
+                                fsname, poolname);
+                        return -ENOENT;
+                }
+                if (rc == 1) {
+                        fprintf(stderr, "OST %s already in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                        return -EEXIST;
+                }
+                return 0;
+        }
+        case LCFG_POOL_REM: {
+                rc = search_ost(fsname, poolname, ostname);
+                if (rc < 0) {
+                        fprintf(stderr, "Pool %s.%s not found\n",
+                                fsname, poolname);
+                        return -ENOENT;
+                }
+                if (rc == 0) {
+                        fprintf(stderr, "OST %s not found in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                        return -ENOENT;
+                }
+                return 0;
+        }
+        default: {
+        }
+        }
+        return 0;
+}
+
+static void check_pool_cmd_result(enum lcfg_command_type cmd,
+                                  char *fsname, char *poolname,
+                                  char *ostname)
+{
+        int cpt, rc = 0;
+
+        cpt = 10;
+        switch (cmd) {
+        case LCFG_POOL_NEW: {
+                do {
+                        rc = search_ost(fsname, poolname, NULL);
+                        if (rc < 0)
+                                sleep(2);
+                        cpt--;
+                } while ((rc < 0) && (cpt > 0));
+                if (rc >= 0)
+                        fprintf(stderr, "Pool %s.%s created\n",
+                                fsname, poolname);
+                else
+                        fprintf(stderr, "Warning, pool %s.%s not found\n",
+                                fsname, poolname);
+                return;
+        }
+        case LCFG_POOL_DEL: {
+                do {
+                         rc = search_ost(fsname, poolname, NULL);
+                         if (rc >= 0)
+                                sleep(2);
+                         cpt--;
+                } while ((rc >= 0) && (cpt > 0));
+                if (rc < 0)
+                        fprintf(stderr, "Pool %s.%s destroyed\n",
+                                fsname, poolname);
+                else
+                        fprintf(stderr, "Warning, pool %s.%s still found\n",
+                                fsname, poolname);
+                return;
+        }
+        case LCFG_POOL_ADD: {
+                do {
+                        rc = search_ost(fsname, poolname, ostname);
+                        if (rc != 1)
+                                sleep(2);
+                        cpt--;
+                } while ((rc != 1) && (cpt > 0));
+                if (rc == 1)
+                        fprintf(stderr, "OST %s added to pool %s.%s\n",
+                                ostname, fsname, poolname);
+                else
+                        fprintf(stderr, "Warning, OST %s not found in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                return;
+        }
+        case LCFG_POOL_REM: {
+                do {
+                        rc = search_ost(fsname, poolname, ostname);
+                        if (rc == 1)
+                                sleep(2);
+                        cpt--;
+                } while ((rc == 1) && (cpt > 0));
+                if (rc != 1)
+                        fprintf(stderr, "OST %s removed from pool %s.%s\n",
+                                ostname, fsname, poolname);
+                else
+                        fprintf(stderr, "Warning, OST %s still found in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                return;
+        }
+        default: {
+        }
+        }
+}
+
+static int check_and_complete_ostname(char *fsname, char *ostname)
+{
+        char *ptr;
+        char real_ostname[MAX_OBD_NAME + 1];
+        char i;
+
+        /* if OST name does not start with fsname, we add it */
+        /* if not check if the fsname is the right one */
+        ptr = strchr(ostname, '-');
+        if (ptr == NULL) {
+                sprintf(real_ostname, "%s-%s", fsname, ostname);
+        } else if (strncmp(ostname, fsname, strlen(fsname)) != 0) {
+                fprintf(stderr, "%s does not start with fsname %s\n",
+                        ostname, fsname);
+                return -EINVAL;
+        } else {
+             strcpy(real_ostname, ostname);
+        }
+        /* real_ostname is fsname-????? */
+        ptr = real_ostname + strlen(fsname) + 1;
+        if (strncmp(ptr, "OST", 3) != 0) {
+                fprintf(stderr, "%s does not start by %s-OST nor OST\n",
+                        ostname, fsname);
+                return -EINVAL;
+        }
+        /* real_ostname is fsname-OST????? */
+        ptr += 3;
+        for (i = 0; i < 4; i++) {
+                if (!isxdigit(*ptr)) {
+                        fprintf(stderr,
+                                "ost's index in %s is not an hexa number\n",
+                                ostname);
+                        return -EINVAL;
+                }
+                ptr++;
+        }
+        /* real_ostname is fsname-OSTXXXX????? */
+        /* if OST name does not end with _UUID, we add it */
+        if (*ptr == '\0') {
+                strcat(real_ostname, "_UUID");
+        } else if (strcmp(ptr, "_UUID") != 0) {
+                fprintf(stderr,
+                        "ostname %s does not end with _UUID\n", ostname);
+                return -EINVAL;
+        }
+        /* real_ostname is fsname-OSTXXXX_UUID */
+        strcpy(ostname, real_ostname);
+        return 0;
+}
+
+/* returns 0 or -errno */
+static int pool_cmd(enum lcfg_command_type cmd,
+                    char *cmdname, char *fullpoolname,
+                    char *fsname, char *poolname, char *ostname)
+{
+        int rc = 0;
+        struct obd_ioctl_data data;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+
+        rc = check_pool_cmd(cmd, fsname, poolname, ostname);
+        if (rc)
+                return rc;
+
+        lustre_cfg_bufs_reset(&bufs, NULL);
+        lustre_cfg_bufs_set_string(&bufs, 0, cmdname);
+        lustre_cfg_bufs_set_string(&bufs, 1, fullpoolname);
+        if (ostname != NULL)
+                lustre_cfg_bufs_set_string(&bufs, 2, ostname);
+
+        lcfg = lustre_cfg_new(cmd, &bufs);
+        if (IS_ERR(lcfg)) {
+                rc = PTR_ERR(lcfg);
+                return rc;
+        }
+
+        IOC_INIT(data);
+        rc = data.ioc_dev = get_mgs_device();
+        if (rc < 0)
+                goto out;
+
+        data.ioc_type = LUSTRE_CFG_TYPE;
+        data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
+                                        lcfg->lcfg_buflens);
+        data.ioc_pbuf1 = (void *)lcfg;
+        IOC_PACK(cmdname, data);
+
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_POOL, buf);
+out:
+        if (rc)
+                rc = -errno;
+        lustre_cfg_free(lcfg);
+        return rc;
+}
+
+/*
+ * this function tranforms a rule [start-end/step] into an array
+ * of matching numbers
+ * supported forms are:
+ * [start]                : just this number
+ * [start-end]            : all numbers from start to end
+ * [start-end/step]       : numbers from start to end with increment of step
+ * on return, format contains a printf format string which can be used
+ * to generate all the strings
+ */
+static int get_array_idx(char *rule, char *format, int **array)
+{
+        char *start, *end, *ptr;
+        unsigned int lo, hi, step;
+        int array_sz = 0;
+        int i, array_idx;
+        int rc;
+
+        start = strchr(rule, '[');
+        end = strchr(rule, ']');
+        if ((start == NULL) || (end == NULL)) {
+                *array = malloc(sizeof(int));
+                if (*array == NULL)
+                        return 0;
+                strcpy(format, rule);
+                array_sz = 1;
+                return array_sz;
+        }
+        *start = '\0';
+        *end = '\0';
+        end++;
+        start++;
+        /* put in format the printf format (the rule without the range) */
+        sprintf(format, "%s%%.4d%s", rule, end);
+
+        array_idx = 0;
+        array_sz = 0;
+        *array = NULL;
+        /* loop on , separator */
+        do {
+                /* extract the 3 fields */
+                rc = sscanf(start, "%u-%u/%u", &lo, &hi, &step);
+                switch (rc) {
+                case 0: {
+                        return 0;
+                }
+                case 1: {
+                        array_sz++;
+                        *array = realloc(*array, array_sz * sizeof(int));
+                        if (*array == NULL)
+                                return 0;
+                        (*array)[array_idx] = lo;
+                        array_idx++;
+                        break;
+                }
+                case 2: {
+                        step = 1;
+                        /* do not break to share code with case 3: */
+                }
+                case 3: {
+                        if ((hi < lo) || (step == 0))
+                                return 0;
+                        array_sz += (hi - lo) / step + 1;
+                        *array = realloc(*array, sizeof(int) * array_sz);
+                        if (*array == NULL)
+                                return 0;
+                        for (i = lo; i <= hi; i+=step, array_idx++)
+                                (*array)[array_idx] = i;
+                        break;
+                }
+                }
+                ptr = strchr(start, ',');
+                if (ptr != NULL)
+                        start = ptr + 1;
+
+        } while (ptr != NULL);
+        return array_sz;
+}
+
+static int extract_fsname_poolname(char *arg, char *fsname, char *poolname)
+{
+        char *ptr;
+        int len;
+        int rc;
+
+        strcpy(fsname, arg);
+        ptr = strchr(fsname, '.');
+        if (ptr == NULL) {
+                fprintf(stderr, ". is missing in %s\n", fsname);
+                rc = -EINVAL;
+                goto err;
+        }
+
+        len = ptr - fsname;
+        if (len == 0) {
+                fprintf(stderr, "fsname is empty\n");
+                rc = -EINVAL;
+                goto err;
+        }
+
+        len = strlen(ptr + 1);
+        if (len == 0) {
+                fprintf(stderr, "poolname is empty\n");
+                rc = -EINVAL;
+                goto err;
+        }
+        if (len > LOV_MAXPOOLNAME) {
+                fprintf(stderr,
+                        "poolname %s is too long (length is %d max is %d)\n",
+                        ptr + 1, len, LOV_MAXPOOLNAME);
+                rc = -ENAMETOOLONG;
+                goto err;
+        }
+        strncpy(poolname, ptr + 1, LOV_MAXPOOLNAME);
+        poolname[LOV_MAXPOOLNAME] = '\0';
+        *ptr = '\0';
+        return 0;
+
+err:
+        fprintf(stderr, "argument %s must be <fsname>.<poolname>\n", arg);
+        return rc;
+}
+
+int jt_pool_cmd(int argc, char **argv)
+{
+        enum lcfg_command_type cmd;
+        char fsname[MAXPATHLEN + 1];
+        char poolname[LOV_MAXPOOLNAME + 1];
+        char *ostnames_buf = NULL;
+        int i, rc;
+        int *array = NULL, array_sz;
+        struct {
+                int     rc;
+                char   *ostname;
+        } *cmds = NULL;
+
+        switch (argc) {
+        case 0:
+        case 1: return CMD_HELP;
+        case 2: {
+                if (strcmp("pool_new", argv[0]) == 0)
+                        cmd = LCFG_POOL_NEW;
+                else if (strcmp("pool_destroy", argv[0]) == 0)
+                        cmd = LCFG_POOL_DEL;
+                else if (strcmp("pool_list", argv[0]) == 0)
+                         return llapi_poollist(argv[1]);
+                else return CMD_HELP;
+
+                rc = extract_fsname_poolname(argv[1], fsname, poolname);
+                if (rc)
+                        break;
+
+                rc = pool_cmd(cmd, argv[0], argv[1],
+                              fsname, poolname, NULL);
+                if (rc)
+                        break;
+
+                check_pool_cmd_result(cmd, fsname, poolname, NULL);
+                break;
+        }
+        default: {
+                char format[2*MAX_OBD_NAME];
+
+                if (strcmp("pool_remove", argv[0]) == 0) {
+                        cmd = LCFG_POOL_REM;
+                } else if (strcmp("pool_add", argv[0]) == 0) {
+                        cmd = LCFG_POOL_ADD;
+                } else {
+                        return CMD_HELP;
+                }
+
+                rc = extract_fsname_poolname(argv[1], fsname, poolname);
+                if (rc)
+                        break;
+
+                for (i = 2; i < argc; i++) {
+                        int j;
+
+                        array_sz = get_array_idx(argv[i], format, &array);
+                        if (array_sz == 0)
+                                return CMD_HELP;
+
+                        cmds = malloc(array_sz * sizeof(cmds[0]));
+                        if (cmds != NULL) {
+                                ostnames_buf = malloc(array_sz *
+                                                      (MAX_OBD_NAME + 1));
+                        } else {
+                                free(array);
+                                rc = -ENOMEM;
+                                goto out;
+                        }
+
+                        for (j = 0; j < array_sz; j++) {
+                                char ostname[MAX_OBD_NAME + 1];
+
+                                snprintf(ostname, MAX_OBD_NAME, format,
+                                         array[j]);
+                                ostname[MAX_OBD_NAME] = '\0';
+
+                                rc = check_and_complete_ostname(fsname,ostname);
+                                if (rc) {
+                                        free(array);
+                                        free(cmds);
+                                        if (ostnames_buf)
+                                                free(ostnames_buf);
+                                        goto out;
+                                }
+                                if (ostnames_buf != NULL) {
+                                        cmds[j].ostname =
+                                          &ostnames_buf[(MAX_OBD_NAME + 1) * j];
+                                        strcpy(cmds[j].ostname, ostname);
+                                } else {
+                                        cmds[j].ostname = NULL;
+                                }
+                                cmds[j].rc = pool_cmd(cmd, argv[0], argv[1],
+                                                      fsname, poolname,
+                                                      ostname);
+                        }
+                        for (j = 0; j < array_sz; j++) {
+                                if (!cmds[j].rc) {
+                                        char ostname[MAX_OBD_NAME + 1];
+
+                                        if (!cmds[j].ostname) {
+                                                snprintf(ostname, MAX_OBD_NAME,
+                                                         format, array[j]);
+                                                ostname[MAX_OBD_NAME] = '\0';
+                                                check_and_complete_ostname(
+                                                        fsname, ostname);
+                                        } else {
+                                                strcpy(ostname,
+                                                       cmds[j].ostname);
+                                        }
+                                        check_pool_cmd_result(cmd, fsname,
+                                                              poolname,ostname);
+                                }
+                        }
+                        if (array_sz > 0)
+                                free(array);
+                        if (cmds)
+                                free(cmds);
+                        if (ostnames_buf);
+                                free(ostnames_buf);
+                }
+                return 0;
+        }
+        }
+
+
+out:
+        if ((rc == -EINVAL) || (rc == -ENOENT))
+                fprintf(stderr, "Does the fs, pool or ost exist?\n");
+        if (rc != 0) {
+                errno = -rc;
+                perror(argv[0]);
+        }
+
+        return rc;
+}
+
+void  llapi_ping_target(char *obd_type, char *obd_name,
+                        char *obd_uuid, void *args)
+{
+        int  rc;
+        struct obd_ioctl_data data;
+
+        memset(&data, 0, sizeof(data));
+        data.ioc_inlbuf4 = obd_name;
+        data.ioc_inllen4 = strlen(obd_name) + 1;
+        data.ioc_dev = OBD_DEV_BY_DEVNAME;
+        memset(buf, 0, sizeof(rawbuf));
+        if (obd_ioctl_pack(&data, &buf, max)) {
+                fprintf(stderr, "error: invalid ioctl\n");
+                return;
+        }
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_PING_TARGET, buf);
+        if (rc)
+                rc = errno;
+        if (rc == ENOTCONN || rc == ESHUTDOWN) {
+                printf("%s inactive.\n", obd_name);
+        } else if (rc) {
+                fprintf(stderr, "error: check '%s' %s\n",
+                        obd_name, strerror(errno));
+        } else {
+                printf("%s active.\n", obd_name);
+        }
+
+}
diff --git a/lustre/utils/obdbarrier.c b/lustre/utils/obdbarrier.c

index f2c6eb1..5e27fb6 100644 (file)
--- a/lustre/utils/obdbarrier.c
+++ b/lustre/utils/obdbarrier.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Eric Barton <eeb@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/obdbarrier.c
+ *
+ * Author: Eric Barton <eeb@clusterfs.com>
   */
  
  #include <stdio.h>
@@ -220,5 +237,3 @@ main (int argc, char **argv)
  
          return (rc == 0 ? 0 : 1);
  }
-
-
diff --git a/lustre/utils/obdctl.c b/lustre/utils/obdctl.c

index 26bedd1..7c8dc23 100644 (file)
--- a/lustre/utils/obdctl.c
+++ b/lustre/utils/obdctl.c
@@ -1,25 +1,42 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/obdctl.c
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
   */
  
  #include <stdlib.h>
diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h

index 3f2bf77..eeb1bb8 100644 (file)
--- a/lustre/utils/obdctl.h
+++ b/lustre/utils/obdctl.h
@@ -1,8 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
+
  #ifndef _OBDCTL_H_
  #define _OBDCTL_H_
  
@@ -88,4 +119,6 @@ int jt_blockdev_attach(int argc, char **argv);
  int jt_blockdev_detach(int argc, char **argv);
  int jt_blockdev_info(int argc, char **argv);
  
+int jt_pool_cmd(int argc, char **argv);
+
  #endif
diff --git a/lustre/utils/obdio.c b/lustre/utils/obdio.c

index a8a6a73..62212b8 100644 (file)
--- a/lustre/utils/obdio.c
+++ b/lustre/utils/obdio.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Eric Barton <eeb@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/obdio.c
+ *
+ * Author: Eric Barton <eeb@clusterfs.com>
   */
  
  #include <stdio.h>
@@ -293,5 +310,3 @@ main (int argc, char **argv)
  
          return (rc == 0 ? 0 : 1);
  }
-
-
diff --git a/lustre/utils/obdiolib.c b/lustre/utils/obdiolib.c

index 3d04475..6b8ec44 100644 (file)
--- a/lustre/utils/obdiolib.c
+++ b/lustre/utils/obdiolib.c
@@ -1,24 +1,41 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2003 Cluster File Systems, Inc.
- *   Author: Eric Barton <eeb@clusterfs.com>
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/obdiolib.c
+ *
+ * Author: Eric Barton <eeb@clusterfs.com>
   */
  
  #include <stdio.h>
diff --git a/lustre/utils/obdiolib.h b/lustre/utils/obdiolib.h

index e9e0642..bfd6c12 100644 (file)
--- a/lustre/utils/obdiolib.h
+++ b/lustre/utils/obdiolib.h
@@ -1,8 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *   This file is part of Lustre, http://www.lustre.org
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
   */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _OBDIOLIB_H_
  #define _OBDIOLIB_H_
  
diff --git a/lustre/utils/parser.c b/lustre/utils/parser.c

index 239c9b9..ec53f12 100644 (file)
--- a/lustre/utils/parser.c
+++ b/lustre/utils/parser.c
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- * Copyright (C) 2001 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #include <stdio.h>
  #include <stdlib.h>
diff --git a/lustre/utils/parser.h b/lustre/utils/parser.h

index a1f899b..62ebe78 100644 (file)
--- a/lustre/utils/parser.h
+++ b/lustre/utils/parser.h
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #ifndef _PARSER_H_
  #define _PARSER_H_
  
diff --git a/lustre/utils/platform.h b/lustre/utils/platform.h

index 4f5b5c9..920e70b 100644 (file)
--- a/lustre/utils/platform.h
+++ b/lustre/utils/platform.h
@@ -1,23 +1,37 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
   *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ * GPL HEADER START
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
   *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
   */
  #ifndef __LUSTRE_UTILS_PLATFORM_H
  #define __LUSTRE_UTILS_PLATFORM_H
diff --git a/lustre/utils/thread.c b/lustre/utils/thread.c

new file mode 100644 (file)

index 0000000..aa1481d
--- /dev/null
+++ b/lustre/utils/thread.c
@@ -0,0 +1,50 @@
+/*****************************************************************************
+ *  $Id: thread.c,v 1.1.10.2 2008/12/18 18:02:31 johann Exp $
+ *****************************************************************************
+ *  Copyright (C) 2003 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Chris Dunlap <cdunlap@llnl.gov>.
+ *
+ *  This file is from LSD-Tools, the LLNL Software Development Toolbox.
+ *
+ *  LSD-Tools is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  LSD-Tools is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *  more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with LSD-Tools; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *****************************************************************************/
+
+
+#if HAVE_CONFIG_H
+#  include "config.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include "thread.h"
+
+
+#if WITH_PTHREADS
+#ifndef NDEBUG
+int
+lsd_mutex_is_locked (pthread_mutex_t *mutex)
+{
+/*  Returns true if the mutex is locked; o/w, returns false.
+ */
+    int rc;
+
+    assert (mutex != NULL);
+    rc = pthread_mutex_trylock (mutex);
+    return (rc == EBUSY ? 1 : 0);
+}
+#endif /* !NDEBUG */
+#endif /* WITH_PTHREADS */
diff --git a/lustre/utils/thread.h b/lustre/utils/thread.h

new file mode 100644 (file)

index 0000000..2a6b1da
--- /dev/null
+++ b/lustre/utils/thread.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+ *  $Id: thread.h,v 1.1.10.2 2008/12/18 18:02:31 johann Exp $
+ *****************************************************************************
+ *  Copyright (C) 2003 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Chris Dunlap <cdunlap@llnl.gov>.
+ *
+ *  This file is from LSD-Tools, the LLNL Software Development Toolbox.
+ *
+ *  LSD-Tools is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  LSD-Tools is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *  more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with LSD-Tools; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *****************************************************************************/
+
+
+#ifndef LSD_THREAD_H
+#define LSD_THREAD_H
+
+#if WITH_PTHREADS
+#  include <errno.h>
+#  include <pthread.h>
+#  include <stdlib.h>
+#endif /* WITH_PTHREADS */
+
+
+/*****************************************************************************
+ *  Macros
+ *****************************************************************************/
+
+#if WITH_PTHREADS
+
+#  ifdef WITH_LSD_FATAL_ERROR_FUNC
+#    undef lsd_fatal_error
+     extern void lsd_fatal_error (char *file, int line, char *mesg);
+#  else /* !WITH_LSD_FATAL_ERROR_FUNC */
+#    ifndef lsd_fatal_error
+#      define lsd_fatal_error(file, line, mesg) (abort ())
+#    endif /* !lsd_fatal_error */
+#  endif /* !WITH_LSD_FATAL_ERROR_FUNC */
+
+#  define lsd_mutex_init(pmutex)                                              \
+     do {                                                                     \
+         int e = pthread_mutex_init (pmutex, NULL);                           \
+         if (e != 0) {                                                        \
+             errno = e;                                                       \
+             lsd_fatal_error (__FILE__, __LINE__, "mutex_init");              \
+             abort ();                                                        \
+         }                                                                    \
+     } while (0)
+
+#  define lsd_mutex_lock(pmutex)                                              \
+     do {                                                                     \
+         int e = pthread_mutex_lock (pmutex);                                 \
+         if (e != 0) {                                                        \
+             errno = e;                                                       \
+             lsd_fatal_error (__FILE__, __LINE__, "mutex_lock");              \
+             abort ();                                                        \
+         }                                                                    \
+     } while (0)
+
+#  define lsd_mutex_unlock(pmutex)                                            \
+     do {                                                                     \
+         int e = pthread_mutex_unlock (pmutex);                               \
+         if (e != 0) {                                                        \
+             errno = e;                                                       \
+             lsd_fatal_error (__FILE__, __LINE__, "mutex_unlock");            \
+             abort ();                                                        \
+         }                                                                    \
+     } while (0)
+
+#  define lsd_mutex_destroy(pmutex)                                           \
+     do {                                                                     \
+         int e = pthread_mutex_destroy (pmutex);                              \
+         if (e != 0) {                                                        \
+             errno = e;                                                       \
+             lsd_fatal_error (__FILE__, __LINE__, "mutex_destroy");           \
+             abort ();                                                        \
+         }                                                                    \
+     } while (0)
+
+#  ifndef NDEBUG
+     int lsd_mutex_is_locked (pthread_mutex_t *pmutex);
+#  endif /* !NDEBUG */
+
+#else /* !WITH_PTHREADS */
+
+#  define lsd_mutex_init(mutex)
+#  define lsd_mutex_lock(mutex)
+#  define lsd_mutex_unlock(mutex)
+#  define lsd_mutex_destroy(mutex)
+#  define lsd_mutex_is_locked(mutex) (1)
+
+#endif /* !WITH_PTHREADS */
+
+
+#endif /* !LSD_THREAD_H */
diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c

index c009fc5..3c0ef9e 100644 (file)
--- a/lustre/utils/wirecheck.c
+++ b/lustre/utils/wirecheck.c
@@ -1,6 +1,39 @@
  /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
   */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/wait.h>
@@ -158,6 +191,8 @@ check_ptlrpc_body(void)
          CHECK_MEMBER(ptlrpc_body, pb_service_time);
          CHECK_MEMBER(ptlrpc_body, pb_slv);
          CHECK_MEMBER(ptlrpc_body, pb_limit);
+        CHECK_MEMBER(ptlrpc_body, pb_pre_versions);
+        CHECK_MEMBER(ptlrpc_body, pb_padding);
  }
  
  static void check_obd_connect_data(void)
@@ -205,7 +240,9 @@ static void check_obd_connect_data(void)
          CHECK_CDEFINE(OBD_CONNECT_LRU_RESIZE);
          CHECK_CDEFINE(OBD_CONNECT_MDS_MDS);
          CHECK_CDEFINE(OBD_CONNECT_REAL);
+        CHECK_CDEFINE(OBD_CONNECT_FID);
          CHECK_CDEFINE(OBD_CONNECT_CKSUM);
+        CHECK_CDEFINE(OBD_CONNECT_VBR);
  }
  
  static void
@@ -334,6 +371,33 @@ check_lov_mds_md_join(void)
  }
  
  static void
+check_lov_mds_md_v3(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(lov_mds_md_v3);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_magic);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_pattern);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_object_id);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_object_gr);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_stripe_size);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_stripe_count);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_pool_name);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_objects);
+
+        BLANK_LINE();
+        CHECK_STRUCT(lov_ost_data_v1);
+        CHECK_MEMBER(lov_ost_data_v1, l_object_id);
+        CHECK_MEMBER(lov_ost_data_v1, l_object_gr);
+        CHECK_MEMBER(lov_ost_data_v1, l_ost_gen);
+        CHECK_MEMBER(lov_ost_data_v1, l_ost_idx);
+
+        CHECK_CDEFINE(LOV_MAGIC_V3);
+
+        CHECK_VALUE(LOV_PATTERN_RAID0);
+        CHECK_VALUE(LOV_PATTERN_RAID1);
+}
+
+static void
  check_obd_statfs(void)
  {
          BLANK_LINE();
@@ -744,6 +808,19 @@ check_ldlm_lvb(void)
          CHECK_MEMBER(ost_lvb, lvb_blocks);
  }
  
+static void
+check_cfg_marker(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(cfg_marker);
+        CHECK_MEMBER(cfg_marker, cm_step);
+        CHECK_MEMBER(cfg_marker, cm_flags);
+        CHECK_MEMBER(cfg_marker, cm_vers);
+        CHECK_MEMBER(cfg_marker, cm_createtime);
+        CHECK_MEMBER(cfg_marker, cm_canceltime);
+        CHECK_MEMBER(cfg_marker, cm_tgtname);
+        CHECK_MEMBER(cfg_marker, cm_comment);
+}
  
  static void
  check_llog_logid(void)
@@ -820,7 +897,7 @@ check_llog_create_rec(void)
          CHECK_MEMBER(llog_create_rec, lcr_hdr);
          CHECK_MEMBER(llog_create_rec, lcr_fid);
          CHECK_MEMBER(llog_create_rec, lcr_oid);
-        CHECK_MEMBER(llog_create_rec, lcr_ogen);
+        CHECK_MEMBER(llog_create_rec, lcr_ogr);
          CHECK_MEMBER(llog_create_rec, padding);
  }
  
@@ -843,8 +920,8 @@ check_llog_unlink_rec(void)
          CHECK_STRUCT(llog_unlink_rec);
          CHECK_MEMBER(llog_unlink_rec, lur_hdr);
          CHECK_MEMBER(llog_unlink_rec, lur_oid);
-        CHECK_MEMBER(llog_unlink_rec, lur_ogen);
-        CHECK_MEMBER(llog_unlink_rec, padding);
+        CHECK_MEMBER(llog_unlink_rec, lur_ogr);
+        CHECK_MEMBER(llog_unlink_rec, lur_count);
          CHECK_MEMBER(llog_unlink_rec, lur_tail);
  }
  
@@ -855,7 +932,7 @@ check_llog_setattr_rec(void)
          CHECK_STRUCT(llog_setattr_rec);
          CHECK_MEMBER(llog_setattr_rec, lsr_hdr);
          CHECK_MEMBER(llog_setattr_rec, lsr_oid);
-        CHECK_MEMBER(llog_setattr_rec, lsr_ogen);
+        CHECK_MEMBER(llog_setattr_rec, lsr_ogr);
          CHECK_MEMBER(llog_setattr_rec, lsr_uid);
          CHECK_MEMBER(llog_setattr_rec, lsr_gid);
          CHECK_MEMBER(llog_setattr_rec, padding);
@@ -863,6 +940,22 @@ check_llog_setattr_rec(void)
  }
  
  static void
+check_llog_setattr64_rec(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(llog_setattr64_rec);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_hdr);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_oid);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_ogr);
+        CHECK_MEMBER(llog_setattr64_rec, padding);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_uid);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_uid_h);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_gid);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_gid_h);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_tail);
+}
+
+static void
  check_llog_size_change_rec(void)
  {
          BLANK_LINE();
@@ -1067,6 +1160,50 @@ check_quota_adjust_qunit(void)
  }
  
  static void
+check_ll_user_fiemap(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(ll_user_fiemap);
+        CHECK_MEMBER(ll_user_fiemap, fm_start);
+        CHECK_MEMBER(ll_user_fiemap, fm_length);
+        CHECK_MEMBER(ll_user_fiemap, fm_flags);
+        CHECK_MEMBER(ll_user_fiemap, fm_mapped_extents);
+        CHECK_MEMBER(ll_user_fiemap, fm_extent_count);
+        CHECK_MEMBER(ll_user_fiemap, fm_reserved);
+        CHECK_MEMBER(ll_user_fiemap, fm_extents);
+
+        CHECK_CDEFINE(FIEMAP_FLAG_SYNC);
+        CHECK_CDEFINE(FIEMAP_FLAG_XATTR);
+        CHECK_CDEFINE(FIEMAP_FLAG_DEVICE_ORDER);
+}
+
+static void
+check_ll_fiemap_extent(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(ll_fiemap_extent);
+        CHECK_MEMBER(ll_fiemap_extent, fe_logical);
+        CHECK_MEMBER(ll_fiemap_extent, fe_physical);
+        CHECK_MEMBER(ll_fiemap_extent, fe_length);
+        CHECK_MEMBER(ll_fiemap_extent, fe_flags);
+        CHECK_MEMBER(ll_fiemap_extent, fe_device);
+
+        CHECK_CDEFINE(FIEMAP_EXTENT_LAST);
+        CHECK_CDEFINE(FIEMAP_EXTENT_UNKNOWN);
+        CHECK_CDEFINE(FIEMAP_EXTENT_DELALLOC);
+        CHECK_CDEFINE(FIEMAP_EXTENT_NO_DIRECT);
+        CHECK_CDEFINE(FIEMAP_EXTENT_SECONDARY);
+        CHECK_CDEFINE(FIEMAP_EXTENT_NET);
+        CHECK_CDEFINE(FIEMAP_EXTENT_DATA_COMPRESSED);
+        CHECK_CDEFINE(FIEMAP_EXTENT_DATA_ENCRYPTED);
+        CHECK_CDEFINE(FIEMAP_EXTENT_NOT_ALIGNED);
+        CHECK_CDEFINE(FIEMAP_EXTENT_DATA_INLINE);
+        CHECK_CDEFINE(FIEMAP_EXTENT_DATA_TAIL);
+        CHECK_CDEFINE(FIEMAP_EXTENT_UNWRITTEN);
+        CHECK_CDEFINE(FIEMAP_EXTENT_MERGED);
+}
+
+static void
  system_string (char *cmdline, char *str, int len)
  {
          int   fds[2];
@@ -1200,6 +1337,7 @@ main(int argc, char **argv)
          CHECK_VALUE(REINT_UNLINK);
          CHECK_VALUE(REINT_RENAME);
          CHECK_VALUE(REINT_OPEN);
+        CHECK_VALUE(REINT_SETXATTR);
          CHECK_VALUE(REINT_MAX);
  
          CHECK_VALUE(MGS_CONNECT);
@@ -1252,7 +1390,7 @@ main(int argc, char **argv)
  
          CHECK_VALUE(MGS_CONNECT);
          CHECK_VALUE(MGS_DISCONNECT);
-        CHECK_VALUE(MGS_EXCEPTION);   
+        CHECK_VALUE(MGS_EXCEPTION);
          CHECK_VALUE(MGS_TARGET_REG);
          CHECK_VALUE(MGS_TARGET_DEL);
          CHECK_VALUE(MGS_SET_INFO);
@@ -1269,6 +1407,7 @@ main(int argc, char **argv)
          check_obd_connect_data();
          check_obdo();
          check_lov_mds_md_v1();
+        check_lov_mds_md_v3();
          check_lov_mds_md_join();
          check_obd_statfs();
          check_obd_ioobj();
@@ -1295,6 +1434,7 @@ main(int argc, char **argv)
          check_ldlm_request();
          check_ldlm_reply();
          check_ldlm_lvb();
+        check_cfg_marker();
          check_llog_logid();
          check_llog_catid();
          check_llog_rec_hdr();
@@ -1304,6 +1444,7 @@ main(int argc, char **argv)
          check_llog_orphan_rec();
          check_llog_unlink_rec();
          check_llog_setattr_rec();
+        check_llog_setattr64_rec();
          check_llog_size_change_rec();
          check_llog_gen();
          check_llog_gen_rec();
@@ -1315,11 +1456,12 @@ main(int argc, char **argv)
          check_mds_extent_desc();
          check_qunit_data();
          check_qunit_data_old2();
-        check_qunit_data_old();
          check_quota_adjust_qunit();
          check_mgs_target_info();
          check_lustre_disk_data();
-        printf("#ifdef LIBLUSTRE_POSIX_ACL\n");
+        check_ll_user_fiemap();
+        check_ll_fiemap_extent();
+        printf("#if defined(LIBLUSTRE_POSIX_ACL) && defined(CONFIG_FS_POSIX_ACL)\n");
  #ifndef LIBLUSTRE_POSIX_ACL
  #error build generator without LIBLUSTRE_POSIX_ACL defined - produce wrong check code.
  #endif
diff --git a/lustre/utils/wirehdr.c b/lustre/utils/wirehdr.c

index a0592d1..6358b6a 100644 (file)
--- a/lustre/utils/wirehdr.c
+++ b/lustre/utils/wirehdr.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <liblustre.h>
  #include <lustre_lib.h>
@@ -23,4 +59,3 @@ int main()
  
          return ret;
  }
-
diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c

index f33017e..ca74af3 100644 (file)
--- a/lustre/utils/wiretest.c
+++ b/lustre/utils/wiretest.c
@@ -1,3 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
  #include <stdio.h>
  #include <liblustre.h>
  #include <lustre_lib.h>
@@ -28,8 +64,8 @@ void lustre_assert_wire_constants(void)
  {
          /* Wire protocol assertions generated by 'wirecheck'
           * (make -C lustre/utils newwiretest)
-         * running on Linux xlab.hostel 2.6.23.12-52.fc7 #1 SMP Tue Dec 18 21:18:02 EST 2007 i686 i68
-         * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-7) */
+         * running on Linux vb1 2.6.18-build.1 #1 SMP Thu Mar 27 14:34:21 MDT 2008 i686 i686 i386 GNU
+         * with gcc version 4.1.2 20070626 (Red Hat 4.1.2-14) */
  
  
          /* Constants... */
@@ -145,7 +181,9 @@ void lustre_assert_wire_constants(void)
                   (long long)REINT_RENAME);
          LASSERTF(REINT_OPEN == 6, " found %lld\n",
                   (long long)REINT_OPEN);
-        LASSERTF(REINT_MAX == 7, " found %lld\n",
+        LASSERTF(REINT_SETXATTR == 7, " found %lld\n",
+                 (long long)REINT_SETXATTR);
+        LASSERTF(REINT_MAX == 8, " found %lld\n",
                   (long long)REINT_MAX);
          LASSERTF(MGS_CONNECT == 250, " found %lld\n",
                   (long long)MGS_CONNECT);
@@ -347,7 +385,7 @@ void lustre_assert_wire_constants(void)
          LASSERT(offsetof(struct lustre_msg_v1, lm_magic) == offsetof(struct lustre_msg_v2, lm_magic));
  
          /* Checks for struct ptlrpc_body */
-        LASSERTF((int)sizeof(struct ptlrpc_body) == 88, " found %lld\n",
+        LASSERTF((int)sizeof(struct ptlrpc_body) == 152, " found %lld\n",
                   (long long)(int)sizeof(struct ptlrpc_body));
          LASSERTF((int)offsetof(struct ptlrpc_body, pb_handle) == 0, " found %lld\n",
                   (long long)(int)offsetof(struct ptlrpc_body, pb_handle));
@@ -413,6 +451,14 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct ptlrpc_body, pb_limit));
          LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_limit) == 4, " found %lld\n",
                   (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_limit));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_pre_versions) == 88, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_pre_versions));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_pre_versions) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_pre_versions));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding) == 120, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_padding));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding));
  
          /* Checks for struct obd_connect_data */
          LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n",
@@ -496,7 +542,9 @@ void lustre_assert_wire_constants(void)
          CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL);
          CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL);
          CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL);
+        CLASSERT(OBD_CONNECT_FID == 0x40000000ULL);
          CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL);
+        CLASSERT(OBD_CONNECT_VBR == 0x80000000ULL);
  
          /* Checks for struct obdo */
          LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n",
@@ -726,6 +774,67 @@ void lustre_assert_wire_constants(void)
          LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
                   (long long)LOV_PATTERN_RAID1);
  
+        /* Checks for struct lov_mds_md_v3 */
+        LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_mds_md_v3));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_id) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_id));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_gr) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_gr));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects) == 0, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects));
+
+        /* Checks for struct lov_ost_data_v1 */
+        LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_ost_data_v1));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_id) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_object_id));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_gr) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_object_gr));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+        CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+        LASSERTF(LOV_PATTERN_RAID0 == 1, " found %lld\n",
+                 (long long)LOV_PATTERN_RAID0);
+        LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
+                 (long long)LOV_PATTERN_RAID1);
+
          /* Checks for struct lov_mds_md_join */
          LASSERTF((int)sizeof(struct lov_mds_md_join) == 56, " found %lld\n",
                   (long long)(int)sizeof(struct lov_mds_md_join));
@@ -1609,6 +1718,38 @@ void lustre_assert_wire_constants(void)
          LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
  
+        /* Checks for struct cfg_marker */
+        LASSERTF((int)sizeof(struct cfg_marker) == 160, " found %lld\n",
+                 (long long)(int)sizeof(struct cfg_marker));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_step));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_flags));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_vers));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_comment));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
          /* Checks for struct llog_logid */
          LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n",
                   (long long)(int)sizeof(struct llog_logid));
@@ -1738,10 +1879,10 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct llog_create_rec, lcr_oid));
          LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_oid) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_oid));
-        LASSERTF((int)offsetof(struct llog_create_rec, lcr_ogen) == 40, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_create_rec, lcr_ogen));
-        LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogen) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogen));
+        LASSERTF((int)offsetof(struct llog_create_rec, lcr_ogr) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_create_rec, lcr_ogr));
+        LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogr));
          LASSERTF((int)offsetof(struct llog_create_rec, padding) == 44, " found %lld\n",
                   (long long)(int)offsetof(struct llog_create_rec, padding));
          LASSERTF((int)sizeof(((struct llog_create_rec *)0)->padding) == 4, " found %lld\n",
@@ -1782,14 +1923,14 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
          LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
-        LASSERTF((int)offsetof(struct llog_unlink_rec, lur_ogen) == 24, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_unlink_rec, lur_ogen));
-        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen));
-        LASSERTF((int)offsetof(struct llog_unlink_rec, padding) == 28, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_unlink_rec, padding));
-        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->padding) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->padding));
+        LASSERTF((int)offsetof(struct llog_unlink_rec, lur_ogr) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_unlink_rec, lur_ogr));
+        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogr));
+        LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
          LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, " found %lld\n",
                   (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
          LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, " found %lld\n",
@@ -1806,10 +1947,10 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct llog_setattr_rec, lsr_oid));
          LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid));
-        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogen) == 24, " found %lld\n",
-                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogen));
-        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogr) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogr));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogr));
          LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_uid) == 28, " found %lld\n",
                   (long long)(int)offsetof(struct llog_setattr_rec, lsr_uid));
          LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid) == 4, " found %lld\n",
@@ -1827,6 +1968,46 @@ void lustre_assert_wire_constants(void)
          LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail));
  
+        /* Checks for struct llog_setattr64_rec */
+        LASSERTF((int)sizeof(struct llog_setattr64_rec) == 56, " found %lld\n",
+                 (long long)(int)sizeof(struct llog_setattr64_rec));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oid) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_ogr) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_ogr));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogr) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogr));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->padding));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
          /* Checks for struct llog_size_change_rec */
          LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n",
                   (long long)(int)sizeof(struct llog_size_change_rec));
@@ -2199,7 +2380,79 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct lustre_disk_data, ldd_params));
          LASSERTF((int)sizeof(((struct lustre_disk_data *)0)->ldd_params) == 4096, " found %lld\n",
                   (long long)(int)sizeof(((struct lustre_disk_data *)0)->ldd_params));
-#ifdef LIBLUSTRE_POSIX_ACL
+
+        /* Checks for struct ll_user_fiemap */
+        LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, " found %lld\n",
+                 (long long)(int)sizeof(struct ll_user_fiemap));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_start));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_length));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_flags));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved));
+        LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_user_fiemap, fm_extents));
+        LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents));
+        CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001);
+        CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002);
+        CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
+
+        /* Checks for struct ll_fiemap_extent */
+        LASSERTF((int)sizeof(struct ll_fiemap_extent) == 32, " found %lld\n",
+                 (long long)(int)sizeof(struct ll_fiemap_extent));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
+        LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
+        CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
+        CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
+        CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
+        CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x00000008);
+        CLASSERT(FIEMAP_EXTENT_SECONDARY == 0x00000010);
+        CLASSERT(FIEMAP_EXTENT_NET == 0x00000020);
+        CLASSERT(FIEMAP_EXTENT_DATA_COMPRESSED == 0x00000040);
+        CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
+        CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
+        CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
+        CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
+        CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
+        CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+#if defined(LIBLUSTRE_POSIX_ACL) && defined(CONFIG_FS_POSIX_ACL)
  
          /* Checks for type posix_acl_xattr_entry */
          LASSERTF((int)sizeof(xattr_acl_entry) == 8, " found %lld\n",
author	johann <johann>
	Thu, 18 Dec 2008 18:02:32 +0000 (18:02 +0000)
committer	johann <johann>
	Thu, 18 Dec 2008 18:02:32 +0000 (18:02 +0000)