LU-3285 merge: 'dom' branch merging

author Mikhal Pershin <mike.pershin@intel.com>

Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)

committer Mikhal Pershin <mike.pershin@intel.com>

Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)
author Mikhal Pershin <mike.pershin@intel.com>
Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)
committer Mikhal Pershin <mike.pershin@intel.com>
Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)
diff --combined lustre/doc/lfs-setstripe.1

index 583cf31,332f1b6..8d91135
--- 1/lustre/doc/lfs-setstripe.1
--- 2/lustre/doc/lfs-setstripe.1
+++ b/lustre/doc/lfs-setstripe.1
@@@ -1,100 -1,82 +1,100 @@@
- -.TH LFS-SETSTRIPIE 1 2015-11-06 "Lustre" "Lustre Utilities"
+ +.TH LFS-SETSTRIPE 1 2017-08-23 "Lustre" "Lustre Utilities"
   .SH NAME
- -lfs setstripe \- set striping pattern of a file.
+ +lfs setstripe \- set striping pattern of a file or directory default
   .SH SYNOPSIS
- -.B lfs setstripe [\fISTRIPE_OPTIONS\fR] <directory|filename>
+ +.B lfs setstripe \fR[\fISTRIPE_OPTIONS\fR] <\fIdirectory\fR|\fIfilename\fR>
   .br
- -.B lfs setstripe -d <directory>
+ +.B lfs setstripe \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR]
+ +[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
   .br
- -.B lfs setstripe <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
- -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+ +.B lfs setstripe --component-add \fR{\fB--component-end\fR|\fB-E \fIend1\fR}
+ +[\fISTRIPE_OPTIONS\fR] [{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR]
+ +\&...] <\fIfilename\fR>
   .br
- -.B lfs setstripe --component-add <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
- -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+ +.B lfs setstripe --component-del \fR{\fB--component-id\fR|\fB-I \fIcomp_id\fR|
+ +.B --component-flags=\fIcomp_flags\fR} <\fIfilename\fR>
   .br
- -.B lfs setstripe --component-del <--component-id|-I comp_id | \
- ---component-flags comp_flags> <filename>
+ +.B lfs setstripe -d \fR<\fIdirectory\fR>
   .br
   .SH DESCRIPTION
   .TP
- -.B lfs setstripe [\fISTRIPE_OPTIONS\fR] <directory|filename>
- -Create a file with specified striping pattern, or set default stripping pattern
- -to a directory.
- -.br
- -.TP
- -.B lfs setstripe -d <directory>
- -.br
- -Delete the default striping on the specified directory.
- -.TP
- -.B lfs setstripe <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
- -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+ +.B lfs setstripe \fR[\fISTRIPE_OPTIONS\fR] <\fIdirectory\fR|\fIfilename\fR>
+ +Create a file with specified layout, or set or replace the default file
+ +layout on an existing directory.  If the default file layout is set on
+ +the filesystem root directory, it will be used as the filesystem-wide
+ +default layout for all files that do not explicitly specify a layout and
+ +do not have a default layout on the parent directory.  The default layout
+ +set on a directory will be copied to any new subdirectories created within
+ +that directory at the time they are created.
+ +.TP
+ +.B lfs setstripe \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR] \
+ +[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
   .br
   Create a file with the specified composite layout. Each component defines the
- -stripe pattern of the file in the range of [start, end). The first component
- -must start from offset 0, and all components must be adjacent with each other,
- -no holes are allowed, so each extent will start at the end of previous extent.
- -The
- -.I -E
+ +stripe pattern of the file in the range of
+ +.RI [ start ", " end ).
+ +The first component implicitly starts at offset 0, and all later components
+ +start at the end of previous extent.  The
+ +.B -E
   option is used to specify the end offset of each component, and it also
- -indicates the following \fISTRIPE_OPTIONS\fR are for this component. A -1 end
- -offset indicates the EOF.
- -.TP
- -.B lfs setstripe --component-add <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
- -[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+ +indicates the following \fISTRIPE_OPTIONS\fR are for this component. The end
+ +offset of
+ +.B -1
+ +or
+ +.B eof
+ +indicates the component extends to the end of file.
+ +.TP
+ +.B lfs setstripe --component-add \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR] \
+ +[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
   .br
   Add components to an existing composite file. The extent start of the first
   component to be added is equal to the extent end of last component in existing
- -file, and all components to be added must be adjacent with each other.
- -.TP
- -.B lfs setstripe --component-del <--component-id|-I comp_id | \
- ---component-flags comp_flags> <filename>
+ +file, and all components to be added must be adjacent with each other.  It is
+ +not possible to add components incrementally to the default directory layout,
+ +since the entire default layout can be replaced with one
+ +.B lfs setstripe
+ +call.
+ +.TP
+ +.B lfs setstripe --component-del \fR{\fB--component-id\fR|\fB-I \fIcomp_id\fR | \
+ +\fB--component-flags \fIcomp_flags\fR} <\fIfilename\fR>
   .br
   Remove the component(s) specified by component ID or flags from an existing
- -file. The ID specified by
- -.I -I
+ +file. The ID specified by the
+ +.B -I
   option is the numerical unique ID of the component, it can be obtained using
   the
   .B lfs getstripe
- -command.
- -.I --component-flags
- -option is used to specify certain type of components, such as all instantiated
- -ones.
+ +command.  It is not possible to delete components from a default directory
+ +layout, since the entire default layout can be replaced with one
+ +.B lfs setstripe
+ +call.
+ +The \fB--component-flags\fR option is used to specify certain type of
+ +components, such as all instantiated ones.
+ +.TP
+ +.B lfs setstripe -d \fR<\fIdirectory\fR>
+ +.br
+ +Delete the default layout on the specified directory.  It is not necessary
+ +to delete the default layout on a directory before replacing it.  This is
+ +only needed if the directory should revert from a directory-specific layout
+ +to using the global filesystem default layout stored on the root directory.
   .SH STRIPE_OPTIONS
   The various stripe related options are listed and explained below:
   .TP
- -.B -c, --stripe-count <\fIstripe_count\fR>
- -The number of OSTs to stripe a file over. 0 means to use the filesystem-wide
- -default stripe count (default 1), and -1 means to stripe over all available
- -OSTs.
+ +.B -c\fR, \fB--stripe-count \fR<\fIstripe_count\fR>
+ +The number of OSTs to stripe a file over. \fB0 \fRmeans to use the
+ +filesystem-wide default stripe count (default 1), and \fB-1 \fRmeans to stripe
+ +over all available OSTs.
   .TP
- -.B -S, --stripe-size <\fIstripe_size\fR>
- -The number of bytes to store on each OST before moving to the next OST. 0 means
- -to use the filesystem-wide default stripe_size (default 1MB).
+ +.B -S\fR, \fB--stripe-size \fR<\fIstripe_size\fR>
+ +The number of bytes to store on each OST before moving to the next OST. \fB0\fR
+ +means to use the filesystem-wide default stripe_size (default 1MB).
   .TP
- -.B -i, --stripe-index <\fIstart_ost_index\fR>
- -The OST index (starting at 0) on which to start striping for this file. -1
+ +.B -i\fR, \fB--stripe-index \fR<\fIstart_ost_index\fR>
+ +The OST index (starting at 0) on which to start striping for this file. \fB-1\fR
   allows the MDS to choose the starting index and it is strongly recommended, as
   this allows space and load balancing to be done by the MDS as needed.
   .TP
- -.B -o, --ost-list <\fIost_indices\fR>
+ +.B -o\fR, \fB--ost-list \fR<\fIost_indices\fR>
   Used to specify the exact stripe layout on the file system. \fIost_indices\fR
   is a list of OSTs referenced by their indices, which are specified in decimal
   or hex form and can be obtained using the
@@@ -121,7 -103,7 +121,7 @@@ must be in the OST list, and it will b
   striping the file. Otherwise the striping will occur in the order specified in
   .IR ost_indices .
   .TP
- -.B -p, --pool <\fIpool_name\fR>
+ +.B -p\fR, \fB--pool \fR<\fIpool_name\fR>
   The name of a predefined pool of OSTs (see
   .BR lctl (8))
   that will be used for striping. The
@@@ -133,35 -115,49 +133,56 @@@ will be used as well; th
   .I start_ost_index
   must be part of the pool or an error will be returned.
   .TP
+ .B -L, --layout <\fIlayout type\fB>\fR
+ The type of stripe layout, can be
+ .BR raid0 ", " released " or " mdt ".
+ It is
+ .BR raid0
+ by default. The
+ .BR mdt
+ type allows placing the first component of the file on the MDT where the inode
+ is located. This is used with composite file layouts and can be defined as
+ the first component only. The
+ .IR stripe_size
+ of the MDT part is always equal to the component size. There is also a per-MDT
+ parameter
+ .IR lod.dom_stripesize
+ to limit the maximum size of a DoM stripe, which can be changed with the
+ .BR lctl\ set_param
+ command, (e.g.
+ .IR lctl\ set_param\ lod.*.dom_stripesize=0
+ , see
+ .BR lctl (8))
+ .TP
   There are two options available only for \fBlfs migrate\fR:
   .TP
- -.B -b, --block
+ +.BR -b ", " --block
   Block file access during data migration (default).
   .TP
- -.B -n, --non-block
+ +.BR -n ", " --non-block
   Abort migrations if concurrent access is detected.
   .SH COMPONENT_OPTIONS
   The various component related options are listed and explained below:
   .TP
- -.B -E, --component-end <\fIend\fR>
+ +.B -E\fR, \fB--component-end \fR<\fIend\fR>
   The end offset of the component,
   .I end
   is specified in bytes, or using a suffix (kMGTP),
- -such as 256M. -1 means the end of file.
+ +such as 256M. \fB-1\fR means the end of file.
   .TP
- -.B -I, --component-id <\fIcomp_id\fR>
+ +.B -I\fR, \fB--component-id \fR<\fIcomp_id\fR>
   The numerical unique component id.
   .TP
- -.B --component-flags <\fIflags\fR>
- -Component flags. Available flags: \fBinit\fR: instantiated component.
- -\fB^init\fR: uninstantiated component.
+ +.B --component-flags \fR<\fIflags\fR>
+ +Component flags. Available \fIflags\fR:
+ +.RS
+ +.RS
+ +.B init\fR: instantiated component.
+ +.RE
+ +.RS
+ +.B ^init\fR: uninstantiated component.
+ +.RE
+ +.RE
   .TP
   .B --component-add
   Add specified components to an existing composite file.
@@@ -175,8 -171,8 +196,8 @@@ with the last component
   This creates a file striped on two OSTs with 128kB on each stripe.
   .TP
   .B $ lfs setstripe -d /mnt/lustre/dir
- -This deletes a default stripe pattern on dir. New files will use the default \
- -striping pattern created therein.
+ +This deletes a default stripe pattern on dir. New files created in that
+ +directory will use the filesystem global default instead.
   .TP
   .B $ lfs setstripe -E 4M -c 1 -E 64M -c 4 -E -1 -c -1 /mnt/lustre/file1
   This creates a file with composite layout, the component has 1 stripe and \
@@@ -189,6 -185,10 +210,10 @@@ the end of file
   .TP
   .B $ lfs setstripe --component-del -I 1 /mnt/lustre/file1
   This deletes the component with ID equals 1 from an existing file.
+ .TP
+ .B $ lfs setstripe -E 1M -L mdt -E -1 /mnt/lustre/file1
+ This creates a file with a Data-on-MDT layout. The first 1M is placed on the MDT and \
+ the rest of the file is placed on OSTs with default striping.
   .SH SEE ALSO
   .BR lfs (1),
   .BR lfs-migrate (1),
diff --combined lustre/include/cl_object.h

index 69b2281,fbaabf6..185ff8d
--- 1/lustre/include/cl_object.h
--- 2/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@@ -289,6 -289,8 +289,8 @@@ struct cl_layout 
         struct lu_buf   cl_buf;
         /** size of layout in lov_mds_md format. */
         size_t          cl_size;
+       /** size of the DoM component if it exists, or zero otherwise */
+       u64             cl_dom_comp_size;
         /** Layout generation. */
         u32             cl_layout_gen;
         /** whether layout is a composite one */
@@@ -703,7 -705,7 +705,7 @@@ enum cl_page_type 
   
           /** Transient page, the transient cl_page is used to bind a cl_page
            *  to vmpage which is not belonging to the same object of cl_page.
- -         *  it is used in DirectIO, lockless IO and liblustre. */
+ +         *  it is used in DirectIO and lockless IO. */
           CPT_TRANSIENT,
   };
   
diff --combined lustre/include/lustre_dlm.h

index 2feb5e2,4ee312a..488575b
--- 1/lustre/include/lustre_dlm.h
--- 2/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@@ -289,11 -289,10 +289,10 @@@ typedef int (*ldlm_cancel_cbt)(struct l
    * of ldlm_[res_]lvbo_[init,update,fill]() functions.
    */
   struct ldlm_valblock_ops {
-         int (*lvbo_init)(struct ldlm_resource *res);
-         int (*lvbo_update)(struct ldlm_resource *res,
-                            struct ptlrpc_request *r,
-                            int increase);
-         int (*lvbo_free)(struct ldlm_resource *res);
+       int (*lvbo_init)(struct ldlm_resource *res);
+       int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock,
+                          struct ptlrpc_request *r,  int increase);
+       int (*lvbo_free)(struct ldlm_resource *res);
         /* Return size of lvb data appropriate RPC size can be reserved */
         int (*lvbo_size)(struct ldlm_lock *lock);
         /* Called to fill in lvb data to RPC buffer @buf */
@@@ -438,14 -437,14 +437,14 @@@ struct ldlm_namespace 
          * This allows the client to start caching negative dentries
          * for a directory and may save an RPC for a later stat.
          */
- -      unsigned int            ns_ctime_age_limit;
+ +      time64_t                ns_ctime_age_limit;
   
         /**
          * Used to rate-limit ldlm_namespace_dump calls.
          * \see ldlm_namespace_dump. Increased by 10 seconds every time
          * it is called.
          */
- -      cfs_time_t              ns_next_dump;
+ +      time64_t                ns_next_dump;
   
         /** "policy" function that does actual lock conflict determination */
         ldlm_res_policy         ns_policy;
@@@ -483,7 -482,7 +482,7 @@@
          * The resources in this namespace remember contended state during
          * \a ns_contention_time, in seconds.
          */
- -      unsigned                ns_contention_time;
+ +      time64_t                ns_contention_time;
   
         /**
          * Limit size of contended extent locks, in bytes.
@@@ -843,7 -842,9 +842,9 @@@ struct ldlm_lock 
   
         /** Private storage for lock user. Opaque to LDLM. */
         void                    *l_ast_data;
- 
+       /* Separate ost_lvb, used mostly by Data-on-MDT for now.
+        * It is introduced to avoid mixing with layout lock data. */
+       struct ost_lvb           l_ost_lvb;
         /*
          * Server-side-only members.
          */
@@@ -871,7 -872,7 +872,7 @@@
          * under this lock.
          * \see ost_rw_prolong_locks
          */
- -      cfs_time_t              l_callback_timeout;
+ +      time64_t                l_callback_timeout;
   
         /** Local PID of process which created this lock. */
         __u32                   l_pid;
@@@ -980,9 -981,8 +981,9 @@@ struct ldlm_resource 
         union {
                 /**
                  * When the resource was considered as contended,
- -               * used only on server side. */
- -              cfs_time_t      lr_contention_time;
+ +               * used only on server side.
+ +               */
+ +              time64_t        lr_contention_time;
                 /**
                  * Associated inode, used only on client side.
                  */
@@@ -1013,6 -1013,12 +1014,12 @@@ static inline bool ldlm_has_layout(stru
                 lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
   }
   
+ static inline bool ldlm_has_dom(struct ldlm_lock *lock)
+ {
+       return lock->l_resource->lr_type == LDLM_IBITS &&
+               lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM;
+ }
+ 
   static inline char *
   ldlm_ns_name(struct ldlm_namespace *ns)
   {
@@@ -1264,7 -1270,7 +1271,7 @@@ struct ldlm_prolong_args 
         struct ldlm_res_id      lpa_resid;
         struct ldlm_extent      lpa_extent;
         enum ldlm_mode          lpa_mode;
- -      int                     lpa_timeout;
+ +      time64_t                lpa_timeout;
         int                     lpa_locks_cnt;
         int                     lpa_blocks_cnt;
   };
@@@ -1313,10 -1319,10 +1320,10 @@@ int ldlm_request_cancel(struct ptlrpc_r
   /** @} ldlm_handlers */
   
   void ldlm_revoke_export_locks(struct obd_export *exp);
- -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock);
+ +time64_t ldlm_bl_timeout(struct ldlm_lock *lock);
   #endif
   int ldlm_del_waiting_lock(struct ldlm_lock *lock);
- -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+ +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout);
   int ldlm_get_ref(void);
   void ldlm_put_ref(void);
   int ldlm_init_export(struct obd_export *exp);
@@@ -1361,9 -1367,11 +1368,11 @@@ ldlm_handle2lock_long(const struct lust
    * Update Lock Value Block Operations (LVBO) on a resource taking into account
    * data from request \a r
    */
- static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
-                                      struct ptlrpc_request *req, int increase)
+ static inline int ldlm_lvbo_update(struct ldlm_resource *res,
+                                  struct ldlm_lock *lock,
+                                  struct ptlrpc_request *req, int increase)
   {
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
         int rc;
   
         /* delayed lvb init may be required */
@@@ -1373,14 -1381,18 +1382,18 @@@
                 return rc;
         }
   
-       if (ldlm_res_to_ns(res)->ns_lvbo &&
-           ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) {
-               return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, req,
-                                                                increase);
-       }
+       if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update)
+               return ns->ns_lvbo->lvbo_update(res, lock, req, increase);
+ 
         return 0;
   }
   
+ static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+                                      struct ptlrpc_request *req, int increase)
+ {
+       return ldlm_lvbo_update(res, NULL, req, increase);
+ }
+ 
   int ldlm_error2errno(enum ldlm_error error);
   enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this
                                                * confuses user-space. */
@@@ -1478,8 -1490,41 +1491,41 @@@ void ldlm_namespace_put(struct ldlm_nam
   int ldlm_proc_setup(void);
   #ifdef CONFIG_PROC_FS
   void ldlm_proc_cleanup(void);
+ 
+ static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
+                                    struct lprocfs_stats *srv_stats)
+ {
+       int lock_type = 0, op = 0;
+ 
+       lock_type = dlm_req->lock_desc.l_resource.lr_type;
+ 
+       switch (lock_type) {
+       case LDLM_PLAIN:
+               op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE;
+               break;
+       case LDLM_EXTENT:
+               op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE;
+               break;
+       case LDLM_FLOCK:
+               op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE;
+               break;
+       case LDLM_IBITS:
+               op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE;
+               break;
+       default:
+               op = 0;
+               break;
+       }
+ 
+       if (op != 0)
+               lprocfs_counter_incr(srv_stats, op);
+ 
+       return;
+ }
   #else
   static inline void ldlm_proc_cleanup(void) {}
+ static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
+                                    struct lprocfs_stats *srv_stats) {}
   #endif
   
   /* resource.c - internal */
@@@ -1668,5 -1713,7 +1714,7 @@@ static inline int ldlm_extent_contain(c
         return ex1->start <= ex2->start && ex1->end >= ex2->end;
   }
   
+ int ldlm_inodebits_drop(struct ldlm_lock *lock,  __u64 to_drop);
+ 
   #endif
   /** @} LDLM */
diff --combined lustre/include/lustre_osc.h

index df36985,aa7ab75..88fe0dd
--- 1/lustre/include/lustre_osc.h
--- 2/lustre/include/lustre_osc.h
+++ b/lustre/include/lustre_osc.h
@@@ -182,6 -182,73 +182,73 @@@ struct osc_thread_info 
         struct lu_buf           oti_ladvise_buf;
   };
   
+ static inline __u64 osc_enq2ldlm_flags(__u32 enqflags)
+ {
+       __u64 result = 0;
+ 
+       CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags);
+ 
+       LASSERT((enqflags & ~CEF_MASK) == 0);
+ 
+       if (enqflags & CEF_NONBLOCK)
+               result |= LDLM_FL_BLOCK_NOWAIT;
+       if (enqflags & CEF_GLIMPSE)
+               result |= LDLM_FL_HAS_INTENT;
+       if (enqflags & CEF_DISCARD_DATA)
+               result |= LDLM_FL_AST_DISCARD_DATA;
+       if (enqflags & CEF_PEEK)
+               result |= LDLM_FL_TEST_LOCK;
+       if (enqflags & CEF_LOCK_MATCH)
+               result |= LDLM_FL_MATCH_LOCK;
+       if (enqflags & CEF_LOCK_NO_EXPAND)
+               result |= LDLM_FL_NO_EXPANSION;
+       if (enqflags & CEF_SPECULATIVE)
+               result |= LDLM_FL_SPECULATIVE;
+       return result;
+ }
+ 
+ typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh,
+                                   int rc);
+ 
+ struct osc_enqueue_args {
+       struct obd_export       *oa_exp;
+       enum ldlm_type          oa_type;
+       enum ldlm_mode          oa_mode;
+       __u64                   *oa_flags;
+       osc_enqueue_upcall_f    oa_upcall;
+       void                    *oa_cookie;
+       struct ost_lvb          *oa_lvb;
+       struct lustre_handle    oa_lockh;
+       bool                    oa_speculative;
+ };
+ 
+ /**
+  * Bit flags for osc_dlm_lock_at_pageoff().
+  */
+ enum osc_dap_flags {
+       /**
+        * Just check if the desired lock exists, it won't hold reference
+        * count on lock.
+        */
+       OSC_DAP_FL_TEST_LOCK = 1 << 0,
+       /**
+        * Return the lock even if it is being canceled.
+        */
+       OSC_DAP_FL_CANCELING = 1 << 1
+ };
+ 
+ /*
+  * The set of operations which are different for MDC and OSC objects
+  */
+ struct osc_object_operations {
+       void (*oto_build_res_name)(struct osc_object *osc,
+                                  struct ldlm_res_id *resname);
+       struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env,
+                                               struct osc_object *obj,
+                                               pgoff_t index,
+                                               enum osc_dap_flags dap_flags);
+ };
+ 
   struct osc_object {
         struct cl_object        oo_cl;
         struct lov_oinfo        *oo_oinfo;
@@@ -242,9 -309,24 +309,24 @@@
         atomic_t                oo_nr_ios;
         wait_queue_head_t       oo_io_waitq;
   
+       const struct osc_object_operations *oo_obj_ops;
         bool                    oo_initialized;
   };
   
+ static inline void osc_build_res_name(struct osc_object *osc,
+                                     struct ldlm_res_id *resname)
+ {
+       return osc->oo_obj_ops->oto_build_res_name(osc, resname);
+ }
+ 
+ static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
+                                                   struct osc_object *obj,
+                                                   pgoff_t index,
+                                                   enum osc_dap_flags flags)
+ {
+       return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags);
+ }
+ 
   static inline void osc_object_lock(struct osc_object *obj)
   {
         spin_lock(&obj->oo_lock);
@@@ -274,6 -356,18 +356,18 @@@ static inline int osc_object_is_locked(
   #endif
   }
   
+ static inline void osc_object_set_contended(struct osc_object *obj)
+ {
+       obj->oo_contention_time = cfs_time_current();
+       /* mb(); */
+       obj->oo_contended = 1;
+ }
+ 
+ static inline void osc_object_clear_contended(struct osc_object *obj)
+ {
+       obj->oo_contended = 0;
+ }
+ 
   /*
    * Lock "micro-states" for osc layer.
    */
@@@ -350,7 -444,8 +444,8 @@@ struct osc_lock 
         enum osc_lock_state     ols_state;
         /** lock value block */
         struct ost_lvb          ols_lvb;
- 
+       /** Lockless operations to be used by lockless lock */
+       const struct cl_lock_operations *ols_lockless_ops;
         /**
          * true, if ldlm_lock_addref() was called against
          * osc_lock::ols_lock. This is used for sanity checking.
@@@ -402,6 -497,10 +497,10 @@@
                                 ols_speculative:1;
   };
   
+ static inline int osc_lock_is_lockless(const struct osc_lock *ols)
+ {
+       return (ols->ols_cl.cls_ops == ols->ols_lockless_ops);
+ }
   
   /**
    * Page state private for osc layer.
@@@ -445,18 -544,6 +544,18 @@@ struct osc_page 
         cfs_time_t              ops_submit_time;
   };
   
+ +struct osc_brw_async_args {
+ +      struct obdo             *aa_oa;
+ +      int                      aa_requested_nob;
+ +      int                      aa_nio_count;
+ +      u32                      aa_page_count;
+ +      int                      aa_resends;
+ +      struct brw_page         **aa_ppga;
+ +      struct client_obd       *aa_cli;
+ +      struct list_head         aa_oaps;
+ +      struct list_head         aa_exts;
+ +};
+ +
   extern struct kmem_cache *osc_lock_kmem;
   extern struct kmem_cache *osc_object_kmem;
   extern struct kmem_cache *osc_thread_kmem;
@@@ -469,16 -556,19 +568,19 @@@ extern struct lu_context_key osc_sessio
   
   #define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
   
+ /* osc_page.c */
   int osc_page_init(const struct lu_env *env, struct cl_object *obj,
                   struct cl_page *page, pgoff_t ind);
   void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj,
                       pgoff_t start, pgoff_t end);
   void osc_lru_add_batch(struct client_obd *cli, struct list_head *list);
   void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
                      enum cl_req_type crt, int brw_flags);
+ int lru_queue_work(const struct lu_env *env, void *data);
+ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
+                   long target, bool force);
+ 
+ /* osc_cache.c */
   int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
   int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
                         u32 async_flags);
@@@ -501,14 -591,120 +603,120 @@@ int osc_cache_writeback_range(const str
                               pgoff_t start, pgoff_t end, int hp, int discard);
   int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
                          pgoff_t start, pgoff_t end);
- void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
-                  struct osc_object *osc);
- int lru_queue_work(const struct lu_env *env, void *data);
+ int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, int async);
+ void osc_wake_cache_waiters(struct client_obd *cli);
   
- void osc_object_set_contended(struct osc_object *obj);
- void osc_object_clear_contended(struct osc_object *obj);
+ static inline int osc_io_unplug_async(const struct lu_env *env,
+                                     struct client_obd *cli,
+                                     struct osc_object *osc)
+ {
+       return osc_io_unplug0(env, cli, osc, 1);
+ }
+ 
+ static inline void osc_io_unplug(const struct lu_env *env,
+                                struct client_obd *cli,
+                                struct osc_object *osc)
+ {
+       (void)osc_io_unplug0(env, cli, osc, 0);
+ }
+ 
+ typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
+                                struct osc_page *, void *);
+ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
+                       struct osc_object *osc, pgoff_t start, pgoff_t end,
+                       osc_page_gang_cbt cb, void *cbdata);
+ int osc_discard_cb(const struct lu_env *env, struct cl_io *io,
+                  struct osc_page *ops, void *cbdata);
+ 
+ /* osc_dev.c */
+ int osc_device_init(const struct lu_env *env, struct lu_device *d,
+                   const char *name, struct lu_device *next);
+ struct lu_device *osc_device_fini(const struct lu_env *env,
+                                 struct lu_device *d);
+ struct lu_device *osc_device_free(const struct lu_env *env,
+                                 struct lu_device *d);
+ 
+ /* osc_object.c */
+ int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+                   const struct lu_object_conf *conf);
+ void osc_object_free(const struct lu_env *env, struct lu_object *obj);
+ int osc_lvb_print(const struct lu_env *env, void *cookie,
+                 lu_printer_t p, const struct ost_lvb *lvb);
+ int osc_object_print(const struct lu_env *env, void *cookie,
+                    lu_printer_t p, const struct lu_object *obj);
+ int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+                struct cl_attr *attr);
+ int osc_attr_update(const struct lu_env *env, struct cl_object *obj,
+                   const struct cl_attr *attr, unsigned valid);
+ int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj,
+                      struct ost_lvb *lvb);
+ int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc);
   int osc_object_is_contended(struct osc_object *obj);
- int osc_lock_is_lockless(const struct osc_lock *olck);
+ int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj,
+                          ldlm_iterator_t iter, void *data);
+ int osc_object_prune(const struct lu_env *env, struct cl_object *obj);
+ 
+ /* osc_request.c */
+ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd);
+ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg);
+ int osc_precleanup_common(struct obd_device *obd);
+ int osc_cleanup_common(struct obd_device *obd);
+ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      u32 keylen, void *key, u32 vallen, void *val,
+                      struct ptlrpc_request_set *set);
+ int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+                                struct hlist_node *hnode, void *arg);
+ int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+                 struct obd_device *obd, struct obd_uuid *cluuid,
+                 struct obd_connect_data *data, void *localdata);
+ int osc_disconnect(struct obd_export *exp);
+ int osc_punch_send(struct obd_export *exp, struct obdo *oa,
+                  obd_enqueue_update_f upcall, void *cookie);
+ 
+ /* osc_io.c */
+ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
+                 enum cl_req_type crt, struct cl_2queue *queue);
+ int osc_io_commit_async(const struct lu_env *env,
+                       const struct cl_io_slice *ios,
+                       struct cl_page_list *qin, int from, int to,
+                       cl_commit_cbt cb);
+ int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios);
+ void osc_io_iter_fini(const struct lu_env *env,
+                     const struct cl_io_slice *ios);
+ int osc_io_write_iter_init(const struct lu_env *env,
+                          const struct cl_io_slice *ios);
+ void osc_io_write_iter_fini(const struct lu_env *env,
+                           const struct cl_io_slice *ios);
+ int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios);
+ void osc_io_setattr_end(const struct lu_env *env,
+                       const struct cl_io_slice *slice);
+ int osc_io_read_start(const struct lu_env *env,
+                     const struct cl_io_slice *slice);
+ int osc_io_write_start(const struct lu_env *env,
+                      const struct cl_io_slice *slice);
+ void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice);
+ int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+                 struct cl_fsync_io *fio);
+ void osc_io_fsync_end(const struct lu_env *env,
+                     const struct cl_io_slice *slice);
+ void osc_read_ahead_release(const struct lu_env *env, void *cbdata);
+ 
+ /* osc_lock.c */
+ void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols,
+                         int force);
+ void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc,
+                          struct osc_lock *oscl);
+ int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj,
+                         struct osc_lock *oscl);
+ void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io,
+                        struct cl_object *obj, struct osc_lock *oscl);
+ int osc_lock_print(const struct lu_env *env, void *cookie,
+                  lu_printer_t p, const struct cl_lock_slice *slice);
+ void osc_lock_cancel(const struct lu_env *env,
+                    const struct cl_lock_slice *slice);
+ void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice);
+ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data);
   
   /*****************************************************************************
    *
@@@ -757,18 -953,6 +965,6 @@@ struct osc_extent 
         unsigned int            oe_mppr;
   };
   
- int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
-                     int sent, int rc);
- int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
- 
- int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
-                          pgoff_t start, pgoff_t end, bool discard_pages);
- 
- typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
-                                struct osc_page *, void *);
- int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
-                        struct osc_object *osc, pgoff_t start, pgoff_t end,
-                        osc_page_gang_cbt cb, void *cbdata);
   /** @} osc */
   
   #endif /* LUSTRE_OSC_H */
diff --combined lustre/include/obd.h

index a19500a,89a427e..9497830
--- 1/lustre/include/obd.h
--- 2/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@@ -33,9 -33,7 +33,9 @@@
   #ifndef __OBD_H
   #define __OBD_H
   
+ +#include <linux/kobject.h>
   #include <linux/spinlock.h>
+ +#include <linux/sysfs.h>
   
   #include <uapi/linux/lustre/lustre_idl.h>
   #include <lustre_lib.h>
@@@ -107,8 -105,7 +107,8 @@@ struct obd_type 
         int                      typ_refcnt;
         struct lu_device_type   *typ_lu;
         spinlock_t               obd_type_lock;
- -      struct kobject          *typ_kobj;
+ +      struct kobject           typ_kobj;
+ +      struct completion        typ_kobj_unregister;
   };
   
   struct brw_page {
@@@ -120,7 -117,7 +120,7 @@@
   
   struct timeout_item {
         enum timeout_event ti_event;
- -      cfs_time_t         ti_timeout;
+ +      time64_t           ti_timeout;
         timeout_cb_t       ti_cb;
         void              *ti_cb_data;
         struct list_head   ti_obd_list;
@@@ -204,9 -201,9 +204,9 @@@ struct client_obd 
          * See osc_{reserve|unreserve}_grant for details. */
         long                    cl_reserved_grant;
         struct list_head        cl_cache_waiters; /* waiting for cache/grant */
- -      cfs_time_t              cl_next_shrink_grant;   /* jiffies */
+ +      time64_t                cl_next_shrink_grant;   /* seconds */
         struct list_head        cl_grant_shrink_list;  /* Timeout event list */
- -      int                     cl_grant_shrink_interval; /* seconds */
+ +      time64_t                cl_grant_shrink_interval; /* seconds */
   
         /* A chunk is an optimal size used by osc_extent to determine
          * the extent size. A chunk is max(PAGE_SIZE, OST block size) */
@@@ -246,7 -243,6 +246,7 @@@
         atomic_t                cl_pending_r_pages;
         __u32                   cl_max_pages_per_rpc;
         __u32                   cl_max_rpcs_in_flight;
+ +      __u32                   cl_short_io_bytes;
         struct obd_histogram    cl_read_rpc_hist;
         struct obd_histogram    cl_write_rpc_hist;
         struct obd_histogram    cl_read_page_hist;
@@@ -309,8 -305,11 +309,11 @@@
         struct mutex              cl_mgc_mutex;
         struct local_oid_storage *cl_mgc_los;
         struct dt_object         *cl_mgc_configs_dir;
-       atomic_t                  cl_mgc_refcount;
         struct obd_export        *cl_mgc_mgsexp;
+       atomic_t                  cl_mgc_refcount;
+       /* in-flight control list and total RPCs counter */
+       struct list_head         cl_flight_waiters;
+       __u32                    cl_rpcs_in_flight;
   
           /* checksumming for data sent over the network */
         unsigned int             cl_checksum:1, /* 0 = disabled, 1 = enabled */
@@@ -376,6 -375,11 +379,11 @@@ struct lov_tgt_desc 
                               ltd_reap:1;  /* should this target be deleted */
   };
   
+ struct lov_md_tgt_desc {
+       struct obd_device *lmtd_mdc;
+       __u32              lmtd_index;
+ };
+ 
   struct lov_obd {
         struct lov_desc         desc;
         struct lov_tgt_desc   **lov_tgts;               /* sparse array */
@@@ -398,10 -402,13 +406,13 @@@
         struct cl_client_cache *lov_cache;
   
         struct rw_semaphore     lov_notify_lock;
+       /* Data-on-MDT: MDC array */
+       struct lov_md_tgt_desc  *lov_mdc_tgts;
   };
   
   struct lmv_tgt_desc {
         struct obd_uuid         ltd_uuid;
+       struct obd_device       *ltd_obd;
         struct obd_export       *ltd_exp;
         __u32                   ltd_idx;
         struct mutex            ltd_fid_mutex;
@@@ -546,7 -553,7 +557,7 @@@ enum obd_notify_event 
   
   /*
    * Data structure used to pass obd_notify()-event to non-obd listeners (llite
- - * and liblustre being main examples).
+ + * being the main example).
    */
   struct obd_notify_upcall {
         int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
@@@ -641,7 -648,7 +652,7 @@@ struct obd_device 
         struct obd_export       *obd_lwp_export;
         /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
         struct list_head        obd_exports_timed;
- -      time_t                  obd_eviction_timer;     /* for ping evictor */
+ +      time64_t                obd_eviction_timer;     /* for ping evictor */
   
         int                     obd_max_recoverable_clients;
         atomic_t                obd_connected_clients;
@@@ -698,8 -705,7 +709,8 @@@
         struct proc_dir_entry   *obd_proc_exports_entry;
         struct proc_dir_entry   *obd_svc_procroot;
         struct lprocfs_stats    *obd_svc_stats;
- -      struct attribute_group  *obd_attrs;
+ +      struct attribute_group           obd_attrs_group;
+ +      struct attribute               **obd_attrs;
         struct lprocfs_vars     *obd_vars;
         atomic_t                obd_evict_inprogress;
         wait_queue_head_t       obd_evict_inprogress_waitq;
@@@ -718,9 -724,8 +729,9 @@@
          * List of outstanding class_incref()'s fo this OBD. For debugging. */
         struct lu_ref                   obd_reference;
   
- -      struct kobject          obd_kobj; /* sysfs object */
- -      struct completion       obd_kobj_unregister;
+ +      struct kset                     obd_kset; /* sysfs object collection */
+ +      struct kobj_type                obd_ktype;
+ +      struct completion               obd_kobj_unregister;
   };
   
   /* get/set_info keys */
diff --combined lustre/include/uapi/linux/lustre/lustre_idl.h

index a845ecf,65a6ec9..9635941
--- 1/lustre/include/uapi/linux/lustre/lustre_idl.h
--- 2/lustre/include/uapi/linux/lustre/lustre_idl.h
+++ b/lustre/include/uapi/linux/lustre/lustre_idl.h
@@@ -107,7 -107,7 +107,7 @@@
   #define MDC_REPLY_PORTAL               10
   //#define MDC_BULK_PORTAL              11
   #define MDS_REQUEST_PORTAL             12
- //#define MDS_REPLY_PORTAL             13
+ #define MDS_IO_PORTAL                 13
   #define MDS_BULK_PORTAL                14
   #define LDLM_CB_REQUEST_PORTAL         15
   #define LDLM_CB_REPLY_PORTAL           16
@@@ -713,7 -713,7 +713,7 @@@ struct ptlrpc_body_v2 
   #define MSG_CONNECT_RECOVERING  0x00000001
   #define MSG_CONNECT_RECONNECT   0x00000002
   #define MSG_CONNECT_REPLAYABLE  0x00000004
- -//#define MSG_CONNECT_PEER        0x8
+ +/* #define MSG_CONNECT_PEER        0x00000008 removed 1.5 */
   #define MSG_CONNECT_LIBCLIENT   0x00000010
   #define MSG_CONNECT_INITIAL     0x00000020
   #define MSG_CONNECT_ASYNC       0x00000040
@@@ -845,10 -845,12 +845,12 @@@
                                 OBD_CONNECT_FLOCK_DEAD | \
                                 OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \
                                 OBD_CONNECT_OPEN_BY_FID | \
-                               OBD_CONNECT_DIR_STRIPE | \
-                               OBD_CONNECT_BULK_MBITS | \
+                               OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \
+                               OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_SRVLOCK | \
+                               OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \
                                 OBD_CONNECT_MULTIMODRPCS | \
                                 OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \
+                               OBD_CONNECT_GRANT_PARAM | \
                                 OBD_CONNECT_FLAGS2)
   
   #define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX
@@@ -1049,10 -1051,10 +1051,10 @@@ enum obdo_flags 
    * those *_DEF magics are only used on server side internally, they
    * won't be put on wire or disk.
    */
- -#define LOV_MAGIC_DEF         0x10000000
- -#define LOV_MAGIC_V1_DEF      (LOV_MAGIC_DEF | LOV_MAGIC_V1)
- -#define LOV_MAGIC_V3_DEF      (LOV_MAGIC_DEF | LOV_MAGIC_V3)
- -#define LOV_MAGIC_COMP_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1)
+ +#define LOV_MAGIC_DEFINED             0x10000000
+ +#define LOV_MAGIC_V1_DEFINED          (LOV_MAGIC_DEFINED | LOV_MAGIC_V1)
+ +#define LOV_MAGIC_V3_DEFINED          (LOV_MAGIC_DEFINED | LOV_MAGIC_V3)
+ +#define LOV_MAGIC_COMP_V1_DEFINED     (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1)
   
   #define lov_pattern(pattern)          (pattern & ~LOV_PATTERN_F_MASK)
   #define lov_pattern_flags(pattern)    (pattern & LOV_PATTERN_F_MASK)
@@@ -1168,6 -1170,7 +1170,7 @@@ lov_mds_md_max_stripe_count(size_t buf_
   #define OBD_MD_FLUID       (0x00000200ULL) /* user ID */
   #define OBD_MD_FLGID       (0x00000400ULL) /* group ID */
   #define OBD_MD_FLFLAGS     (0x00000800ULL) /* flags word */
+ #define OBD_MD_DOM_SIZE    (0X00001000ULL) /* Data-on-MDT component size */
   #define OBD_MD_FLNLINK     (0x00002000ULL) /* link count */
   #define OBD_MD_FLGENER     (0x00004000ULL) /* generation number */
   /*#define OBD_MD_FLINLINE    (0x00008000ULL)  inline data. used until 1.6.5 */
@@@ -1274,13 -1277,6 +1277,13 @@@ struct hsm_state_set 
                                OBD_BRW_OVER_GRPQUOTA | \
                                OBD_BRW_OVER_PRJQUOTA)
   
+ +#define OBD_BRW_LOCAL1        0x80000000UL    /*
+ +                                       * osd-ldiskfs internal,
+ +                                       * page mapped to real block
+ +                                       */
+ +
+ +#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1)
+ +
   #define OBD_OBJECT_EOF LUSTRE_EOF
   
   #define OST_MIN_PRECREATE 32
@@@ -1366,7 -1362,6 +1369,7 @@@ union lquota_id 
         struct lu_fid   qid_fid; /* FID for per-directory quota */
         __u64           qid_uid; /* user identifier */
         __u64           qid_gid; /* group identifier */
+ +      __u64           qid_projid; /* project identifier */
   };
   
   /* quotactl management */
@@@ -1563,7 -1558,7 +1566,7 @@@ typedef enum 
    * Do not exceed 63
    */
   
- -typedef enum {
+ +enum mds_reint_op {
         REINT_SETATTR  = 1,
         REINT_CREATE   = 2,
         REINT_LINK     = 3,
@@@ -1574,7 -1569,7 +1577,7 @@@
         REINT_RMENTRY  = 8,
         REINT_MIGRATE  = 9,
           REINT_MAX
- -} mds_reint_t, mdt_reint_t;
+ +};
   
   /* the disposition of the intent outlines what was executed */
   #define DISP_IT_EXECD        0x00000001
@@@ -1610,10 -1605,13 +1613,13 @@@
    * will grant LOOKUP_LOCK. */
   #define MDS_INODELOCK_PERM   0x000010
   #define MDS_INODELOCK_XATTR  0x000020 /* extended attributes */
+ #define MDS_INODELOCK_DOM    0x000040 /* Data for data-on-mdt files */
   
- #define MDS_INODELOCK_MAXSHIFT 5
+ #define MDS_INODELOCK_MAXSHIFT 6
   /* This FULL lock is useful to take on unlink sort of operations */
   #define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
+ /* DOM lock shouldn't be canceled early, use this macro for ELC */
+ #define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM)
   
   /* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
    * but was moved into name[1] along with the OID to avoid consuming the
@@@ -1730,9 -1728,9 +1736,9 @@@ struct mdt_body 
         __u32   mbo_uid_h; /* high 32-bits of uid, for FUID */
         __u32   mbo_gid_h; /* high 32-bits of gid, for FUID */
         __u32   mbo_projid;
-       __u64   mbo_padding_6; /* also fix lustre_swab_mdt_body */
-       __u64   mbo_padding_7;
-       __u64   mbo_padding_8;
+       __u64   mbo_dom_size; /* size of DOM component */
+       __u64   mbo_dom_blocks; /* blocks consumed by DOM component */
+       __u64   mbo_padding_8; /* also fix lustre_swab_mdt_body */
         __u64   mbo_padding_9;
         __u64   mbo_padding_10;
   }; /* 216 */
@@@ -2047,17 -2045,17 +2053,17 @@@ struct mdt_rec_reint 
   
   /* lmv structures */
   struct lmv_desc {
- -        __u32 ld_tgt_count;                /* how many MDS's */
- -        __u32 ld_active_tgt_count;         /* how many active */
- -        __u32 ld_default_stripe_count;     /* how many objects are used */
- -      __u32 ld_pattern;                  /* default hash pattern */
- -        __u64 ld_default_hash_size;
- -        __u64 ld_padding_1;                /* also fix lustre_swab_lmv_desc */
- -        __u32 ld_padding_2;                /* also fix lustre_swab_lmv_desc */
- -        __u32 ld_qos_maxage;               /* in second */
- -        __u32 ld_padding_3;                /* also fix lustre_swab_lmv_desc */
- -        __u32 ld_padding_4;                /* also fix lustre_swab_lmv_desc */
- -        struct obd_uuid ld_uuid;
+ +      __u32 ld_tgt_count;             /* how many MDS's */
+ +      __u32 ld_active_tgt_count;      /* how many active */
+ +      __u32 ld_default_stripe_count;  /* how many objects are used */
+ +      __u32 ld_pattern;               /* default hash pattern */
+ +      __u64 ld_default_hash_size;
+ +      __u64 ld_padding_1;             /* also fix lustre_swab_lmv_desc */
+ +      __u32 ld_padding_2;             /* also fix lustre_swab_lmv_desc */
+ +      __u32 ld_qos_maxage;            /* in seconds */
+ +      __u32 ld_padding_3;             /* also fix lustre_swab_lmv_desc */
+ +      __u32 ld_padding_4;             /* also fix lustre_swab_lmv_desc */
+ +      struct obd_uuid ld_uuid;
   };
   
   /* LMV layout EA, and it will be stored both in master and slave object */
@@@ -2095,7 -2093,7 +2101,7 @@@ struct lmv_mds_md_v1 
   
   #define LMV_HASH_FLAG_MIGRATION       0x80000000
   
- -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 10, 56, 0)
+ +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 11, 56, 0)
   /* Since lustre 2.8, this flag will not be needed, instead this DEAD
    * and orphan flags will be stored in LMA (see LMAI_ORPHAN)
    * Keep this flag just for LFSCK, because it still might meet such
@@@ -2228,17 -2226,17 +2234,17 @@@ typedef enum 
   
   /* LOV settings descriptor (should only contain static info) */
   struct lov_desc {
- -        __u32 ld_tgt_count;                /* how many OBD's */
- -        __u32 ld_active_tgt_count;         /* how many active */
- -        __u32 ld_default_stripe_count;     /* how many objects are used */
- -        __u32 ld_pattern;                  /* default PATTERN_RAID0 */
- -        __u64 ld_default_stripe_size;      /* in bytes */
- -        __u64 ld_default_stripe_offset;    /* in bytes */
- -        __u32 ld_padding_0;                /* unused */
- -        __u32 ld_qos_maxage;               /* in second */
- -        __u32 ld_padding_1;                /* also fix lustre_swab_lov_desc */
- -        __u32 ld_padding_2;                /* also fix lustre_swab_lov_desc */
- -        struct obd_uuid ld_uuid;
+ +      __u32 ld_tgt_count;             /* how many OBD's */
+ +      __u32 ld_active_tgt_count;      /* how many active */
+ +      __s32 ld_default_stripe_count;  /* how many objects are used */
+ +      __u32 ld_pattern;               /* default PATTERN_RAID0 */
+ +      __u64 ld_default_stripe_size;   /* in bytes */
+ +      __s64 ld_default_stripe_offset; /* starting OST index */
+ +      __u32 ld_padding_0;             /* unused */
+ +      __u32 ld_qos_maxage;            /* in seconds */
+ +      __u32 ld_padding_1;             /* also fix lustre_swab_lov_desc */
+ +      __u32 ld_padding_2;             /* also fix lustre_swab_lov_desc */
+ +      struct obd_uuid ld_uuid;
   };
   
   #define ld_magic ld_active_tgt_count       /* for swabbing from llogs */
@@@ -2366,6 -2364,8 +2372,8 @@@ enum ldlm_intent_flags 
         IT_QUOTA_DQACQ = 0x00000800,
         IT_QUOTA_CONN  = 0x00001000,
         IT_SETXATTR    = 0x00002000,
+       IT_GLIMPSE     = 0x00004000,
+       IT_BRW         = 0x00008000,
   };
   
   struct ldlm_intent {
diff --combined lustre/ldlm/ldlm_lib.c

index 03a63ca,09c84b4..5e8a03f
--- 1/lustre/ldlm/ldlm_lib.c
--- 2/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@@ -391,6 -391,9 +391,9 @@@ int client_obd_setup(struct obd_device 
         atomic_long_set(&cli->cl_unstable_count, 0);
         INIT_LIST_HEAD(&cli->cl_shrink_list);
   
+       INIT_LIST_HEAD(&cli->cl_flight_waiters);
+       cli->cl_rpcs_in_flight = 0;
+ 
         init_waitqueue_head(&cli->cl_destroy_waitq);
         atomic_set(&cli->cl_destroy_in_flight, 0);
   #ifdef ENABLE_CHECKSUM
@@@ -409,8 -412,6 +412,8 @@@
          * from OFD after connecting. */
         cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
   
+ +      cli->cl_short_io_bytes = OBD_MAX_SHORT_IO_BYTES;
+ +
         /* set cl_chunkbits default value to PAGE_SHIFT,
          * it will be updated at OSC connection time. */
         cli->cl_chunkbits = PAGE_SHIFT;
@@@ -428,7 -429,7 +431,7 @@@
                         cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX;
                 else
                         cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT;
-         }
+       }
   
         spin_lock_init(&cli->cl_mod_rpcs_lock);
         spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock);
@@@ -744,12 -745,12 +747,12 @@@ static int target_handle_reconnect(stru
   {
         struct obd_device *target;
         struct lustre_handle *hdl;
- -      cfs_time_t now;
- -      cfs_time_t deadline;
- -      int timeout;
+ +      time64_t deadline;
+ +      time64_t timeout;
+ +      time64_t now;
         int rc = 0;
- -      ENTRY;
   
+ +      ENTRY;
         hdl = &exp->exp_imp_reverse->imp_remote_handle;
         if (!exp->exp_connection || !lustre_handle_is_used(hdl)) {
                 conn->cookie = exp->exp_handle.h_cookie;
@@@ -783,16 -784,16 +786,16 @@@
                 GOTO(out_already, rc);
         }
   
- -      now = cfs_time_current();
- -      deadline = target->obd_recovery_timer.expires;
- -      if (cfs_time_before(now, deadline)) {
- -              struct target_distribute_txn_data *tdtd =
- -                                      class_exp2tgt(exp)->lut_tdtd;
+ +      now = ktime_get_seconds();
+ +      deadline = cfs_duration_sec(target->obd_recovery_timer.expires);
+ +      if (now < deadline) {
+ +              struct target_distribute_txn_data *tdtd;
                 int size = 0;
                 int count = 0;
                 char *buf = NULL;
   
- -              timeout = cfs_duration_sec(cfs_time_sub(deadline, now));
+ +              timeout = deadline - now;
+ +              tdtd = class_exp2tgt(exp)->lut_tdtd;
                 if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
                         buf = tdtd->tdtd_show_update_logs_retrievers(
                                 tdtd->tdtd_show_retrievers_cbdata,
@@@ -800,7 -801,7 +803,7 @@@
   
                 if (count > 0)
                         LCONSOLE_WARN("%s: Recovery already passed deadline "
- -                                    "%d:%.02d. It is due to DNE recovery "
+ +                                    "%lld:%.02lld. It is due to DNE recovery "
                                       "failed/stuck on the %d MDT(s):%s. "
                                       "Please wait until all MDTs recovered "
                                       "or abort the recovery by force.\n",
@@@ -809,7 -810,7 +812,7 @@@
                                       buf ? buf : "unknown (not enough RAM)");
                 else
                         LCONSOLE_WARN("%s: Recovery already passed deadline "
- -                                    "%d:%.02d. If you do not want to wait "
+ +                                    "%lld:%.02lld. If you do not want to wait "
                                       "more, please abort the recovery by "
                                       "force.\n", target->obd_name,
                                       timeout / 60, timeout % 60);
@@@ -817,9 -818,9 +820,9 @@@
                 if (buf != NULL)
                         OBD_FREE(buf, size);
         } else {
- -              timeout = cfs_duration_sec(cfs_time_sub(now, deadline));
+ +              timeout = now - deadline;
                 LCONSOLE_WARN("%s: Recovery already passed deadline"
- -                      " %d:%.02d, It is most likely due to DNE"
+ +                      " %lld:%.02lld, It is most likely due to DNE"
                         " recovery is failed or stuck, please wait a"
                         " few more minutes or abort the recovery.\n",
                         target->obd_name, timeout / 60, timeout % 60);
@@@ -953,6 -954,7 +956,6 @@@ int target_handle_connect(struct ptlrpc
          * reconnect case */
         struct lustre_handle conn;
         struct lustre_handle *tmp;
- -        struct obd_uuid tgtuuid;
           struct obd_uuid cluuid;
           char *str;
           int rc = 0;
@@@ -961,6 -963,7 +964,6 @@@
         bool     mds_conn = false, lw_client = false, initial_conn = false;
         bool     mds_mds_conn = false;
         bool     new_mds_mds_conn = false;
- -      bool     target_referenced = false;
           struct obd_connect_data *data, *tmpdata;
           int size, tmpsize;
           lnet_nid_t *client_nid = NULL;
@@@ -974,7 -977,11 +977,7 @@@
                   GOTO(out, rc = -EINVAL);
           }
   
- -        obd_str2uuid(&tgtuuid, str);
- -        target = class_uuid2obd(&tgtuuid);
- -        if (!target)
- -                target = class_name2obd(str);
- -
+ +      target = class_dev_by_str(str);
         if (!target) {
                 deuuidify(str, NULL, &target_start, &target_len);
                 LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect "
@@@ -986,9 -993,6 +989,9 @@@
         }
   
         spin_lock(&target->obd_dev_lock);
+ +
+ +      target->obd_conn_inprogress++;
+ +
         if (target->obd_stopping || !target->obd_set_up) {
                 spin_unlock(&target->obd_dev_lock);
   
@@@ -1010,6 -1014,13 +1013,6 @@@
                 GOTO(out, rc = -EAGAIN);
         }
   
- -      /* Make sure the target isn't cleaned up while we're here. Yes,
- -       * there's still a race between the above check and our incref here.
- -       * Really, class_uuid2obd should take the ref. */
- -      class_incref(target, __func__, current);
- -      target_referenced = true;
- -
- -      target->obd_conn_inprogress++;
         spin_unlock(&target->obd_dev_lock);
   
           str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID);
@@@ -1048,17 -1059,32 +1051,17 @@@
          */
         if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
                 GOTO(out, rc = -EPROTO);
- -#endif
   
+ +      /* Don't allow liblustre clients to connect.
+ +       * - testing was disabled in v2_2_50_0-61-g6a75d65
+ +       * - building was disabled in v2_5_58_0-28-g7277179
+ +       * - client code was deleted in v2_6_50_0-101-gcdfbc72,
+ +       * - clients were refused connect for version difference > 0.0.1.32  */
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
- -              if (data->ocd_version < LUSTRE_VERSION_CODE -
- -                                             LUSTRE_VERSION_ALLOWED_OFFSET ||
- -                  data->ocd_version > LUSTRE_VERSION_CODE +
- -                                             LUSTRE_VERSION_ALLOWED_OFFSET) {
- -                      DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) "
- -                                "libclient connection attempt",
- -                                data->ocd_version < LUSTRE_VERSION_CODE ?
- -                                "old" : "new",
- -                                OBD_OCD_VERSION_MAJOR(data->ocd_version),
- -                                OBD_OCD_VERSION_MINOR(data->ocd_version),
- -                                OBD_OCD_VERSION_PATCH(data->ocd_version),
- -                                OBD_OCD_VERSION_FIX(data->ocd_version));
- -                      data = req_capsule_server_sized_get(&req->rq_pill,
- -                                                          &RMF_CONNECT_DATA,
- -                                  offsetof(typeof(*data), ocd_version) +
- -                                           sizeof(data->ocd_version));
- -                      if (data) {
- -                              data->ocd_connect_flags = OBD_CONNECT_VERSION;
- -                              data->ocd_version = LUSTRE_VERSION_CODE;
- -                      }
- -                      GOTO(out, rc = -EPROTO);
- -              }
+ +              DEBUG_REQ(D_WARNING, req, "Refusing libclient connection");
+ +              GOTO(out, rc = -EPROTO);
         }
+ +#endif
   
         /* Note: lw_client is needed in MDS-MDS failover during update log
          * processing, so we needs to allow lw_client to be connected at
@@@ -1210,11 -1236,11 +1213,11 @@@ no_export
                   GOTO(out, rc);
           }
   
- -      CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %ld last %ld\n",
- -               target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
- -              target->obd_recovering ? "recovering/" : "", data->ocd_transno,
- -              export, (long)cfs_time_current_sec(),
- -              export ? (long)export->exp_last_request_time : 0);
+ +      CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n",
+ +             target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
+ +             target->obd_recovering ? "recovering/" : "", data->ocd_transno,
+ +             export, ktime_get_real_seconds(),
+ +             export ? export->exp_last_request_time : 0);
   
         /* If this is the first time a client connects, reset the recovery
          * timer. Discard lightweight connections which might be local. */
@@@ -1240,26 -1266,27 +1243,26 @@@
                 /* allow "new" MDT to be connected during recovery, since we
                  * need retrieve recovery update records from it */
                 if (target->obd_recovering && !lw_client && !mds_mds_conn) {
- -                        cfs_time_t t;
- -                      int     c; /* connected */
- -                      int     i; /* in progress */
- -                      int     k; /* known */
- -                      int     s; /* stale/evicted */
+ +                      time64_t t;
+ +                      int c; /* connected */
+ +                      int i; /* in progress */
+ +                      int k; /* known */
+ +                      int s; /* stale/evicted */
   
                         c = atomic_read(&target->obd_connected_clients);
                         i = atomic_read(&target->obd_lock_replay_clients);
                         k = target->obd_max_recoverable_clients;
                         s = target->obd_stale_clients;
                         t = target->obd_recovery_timer.expires;
- -                      t = cfs_time_sub(t, cfs_time_current());
- -                      t = cfs_duration_sec(t);
+ +                      t = cfs_duration_sec(target->obd_recovery_timer.expires);
+ +                      t -= ktime_get_seconds();
                         LCONSOLE_WARN("%s: Denying connection for new client %s"
                                       "(at %s), waiting for %d known clients "
                                       "(%d recovered, %d in progress, and %d "
- -                                    "evicted) to recover in %d:%.02d\n",
+ +                                    "evicted) to recover in %lld:%.02lld\n",
                                       target->obd_name, cluuid.uuid,
                                       libcfs_nid2str(req->rq_peer.nid), k,
- -                                    c - i, i, s, (int)t / 60,
- -                                    (int)t % 60);
+ +                                    c - i, i, s, t / 60, t % 60);
                         rc = -EBUSY;
                 } else {
   dont_check_exports:
@@@ -1314,26 -1341,37 +1317,26 @@@
                 spin_unlock(&export->exp_lock);
                 CDEBUG(D_RPCTRACE, "%s: %s already connected at greater "
                        "or equal conn_cnt: %d >= %d\n",
- -                       cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
- -                       export->exp_conn_cnt,
- -                       lustre_msg_get_conn_cnt(req->rq_reqmsg));
+ +                     cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
+ +                     export->exp_conn_cnt,
+ +                     lustre_msg_get_conn_cnt(req->rq_reqmsg));
   
- -                GOTO(out, rc = -EALREADY);
- -        }
- -        LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
- -        export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
- -
- -      /* Don't evict liblustre clients for not pinging. */
- -        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
- -                export->exp_libclient = 1;
- -              spin_unlock(&export->exp_lock);
- -
- -              spin_lock(&target->obd_dev_lock);
- -              list_del_init(&export->exp_obd_chain_timed);
- -              spin_unlock(&target->obd_dev_lock);
- -      } else {
- -              spin_unlock(&export->exp_lock);
+ +              GOTO(out, rc = -EALREADY);
         }
+ +      LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
+ +      export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+ +      spin_unlock(&export->exp_lock);
   
- -        if (export->exp_connection != NULL) {
+ +      if (export->exp_connection != NULL) {
                 /* Check to see if connection came from another NID. */
- -                if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) &&
+ +              if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) &&
                     !hlist_unhashed(&export->exp_nid_hash))
- -                        cfs_hash_del(export->exp_obd->obd_nid_hash,
- -                                     &export->exp_connection->c_peer.nid,
- -                                     &export->exp_nid_hash);
+ +                      cfs_hash_del(export->exp_obd->obd_nid_hash,
+ +                                   &export->exp_connection->c_peer.nid,
+ +                                   &export->exp_nid_hash);
   
- -                ptlrpc_connection_put(export->exp_connection);
- -        }
+ +              ptlrpc_connection_put(export->exp_connection);
+ +      }
   
         export->exp_connection = ptlrpc_connection_get(req->rq_peer,
                                                        req->rq_self,
@@@ -1407,11 -1445,12 +1410,11 @@@ out
   
                 class_export_put(export);
         }
- -      if (target_referenced == true && target != NULL) {
+ +      if (target != NULL) {
                 spin_lock(&target->obd_dev_lock);
                 target->obd_conn_inprogress--;
                 spin_unlock(&target->obd_dev_lock);
- -
- -              class_decref(target, __func__, current);
+ +              class_decref(target, "find", current);
         }
         req->rq_status = rc;
         RETURN(rc);
@@@ -1573,13 -1612,12 +1576,13 @@@ static void target_finish_recovery(stru
         obd->obd_recovery_end = ktime_get_real_seconds();
   
         /* When recovery finished, cleanup orphans on MDS and OST. */
- -        if (OBT(obd) && OBP(obd, postrecov)) {
- -                int rc = OBP(obd, postrecov)(obd);
- -                if (rc < 0)
- -                        LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
- -                                      obd->obd_name, rc);
- -        }
+ +      if (obd->obd_type && OBP(obd, postrecov)) {
+ +              int rc = OBP(obd, postrecov)(obd);
+ +
+ +              if (rc < 0)
+ +                      LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+ +                                    obd->obd_name, rc);
+ +      }
           EXIT;
   }
   
@@@ -1699,7 -1737,7 +1702,7 @@@ static void target_start_recovery_timer
         }
   
         mod_timer(&obd->obd_recovery_timer,
- -                cfs_time_shift(obd->obd_recovery_timeout));
+ +                jiffies + cfs_time_seconds(obd->obd_recovery_timeout));
         obd->obd_recovery_start = ktime_get_real_seconds();
         spin_unlock(&obd->obd_dev_lock);
   
@@@ -1718,7 -1756,7 +1721,7 @@@
    * if @extend is true, extend recovery window to have @drt remaining at least;
    * otherwise, make sure the recovery timeout value is not less than @drt.
    */
- -static void extend_recovery_timer(struct obd_device *obd, int drt,
+ +static void extend_recovery_timer(struct obd_device *obd, time64_t drt,
                                   bool extend)
   {
         time64_t now;
@@@ -1754,7 -1792,7 +1757,7 @@@
                   obd->obd_recovery_timeout = to;
                 end = obd->obd_recovery_start + to;
                 mod_timer(&obd->obd_recovery_timer,
- -                        cfs_time_shift(end - now));
+ +                        jiffies + cfs_time_seconds(end - now));
           }
         spin_unlock(&obd->obd_dev_lock);
   
@@@ -1778,7 -1816,7 +1781,7 @@@ check_and_start_recovery_timer(struct o
                                  struct ptlrpc_request *req,
                                  int new_client)
   {
- -        int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
+ +      time64_t service_time = lustre_msg_get_service_time(req->rq_reqmsg);
           struct obd_device_target *obt = &obd->u.obt;
   
           if (!new_client && service_time)
@@@ -1790,8 -1828,7 +1793,8 @@@
           target_start_recovery_timer(obd);
   
         /* Convert the service time to RPC timeout,
- -       * and reuse service_time to limit stack usage. */
+ +       * and reuse service_time to limit stack usage.
+ +       */
         service_time = at_est2timeout(service_time);
   
         if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) &&
@@@ -2134,7 -2171,7 +2137,7 @@@ static void handle_recovery_req(struct 
   
           /* don't reset timer for final stage */
           if (!exp_finished(req->rq_export)) {
- -                int to = obd_timeout;
+ +              time64_t to = obd_timeout;
   
                   /**
                    * Add request timeout to the recovery time so next request from
@@@ -3126,10 -3163,10 +3129,10 @@@ static inline const char *bulk2type(str
   int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
                      struct l_wait_info *lwi)
   {
- -      struct ptlrpc_request   *req = desc->bd_req;
- -      time_t                   start = cfs_time_current_sec();
- -      time_t                   deadline;
- -      int                      rc = 0;
+ +      struct ptlrpc_request *req = desc->bd_req;
+ +      time64_t start = ktime_get_real_seconds();
+ +      time64_t deadline;
+ +      int rc = 0;
   
         ENTRY;
   
@@@ -3176,13 -3213,12 +3179,13 @@@
                 deadline = req->rq_deadline;
   
         do {
- -              long timeoutl = deadline - cfs_time_current_sec();
- -              cfs_duration_t timeout = timeoutl <= 0 ?
- -                                       CFS_TICK : cfs_time_seconds(timeoutl);
- -              time_t  rq_deadline;
+ +              time64_t timeoutl = deadline - ktime_get_real_seconds();
+ +              long timeout_jiffies = timeoutl <= 0 ?
+ +                                     1 : cfs_time_seconds(timeoutl);
+ +              time64_t rq_deadline;
   
- -              *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
+ +              *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies,
+ +                                          cfs_time_seconds(1),
                                             target_bulk_timeout, desc);
                 rc = l_wait_event(desc->bd_waitq,
                                   !ptlrpc_server_bulk_active(desc) ||
@@@ -3196,13 -3232,13 +3199,13 @@@
                 deadline = start + bulk_timeout;
                 if (deadline > rq_deadline)
                         deadline = rq_deadline;
- -      } while ((rc == -ETIMEDOUT) &&
- -               (deadline > cfs_time_current_sec()));
+ +      } while (rc == -ETIMEDOUT &&
+ +               deadline > ktime_get_real_seconds());
   
         if (rc == -ETIMEDOUT) {
- -              DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds",
+ +              DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds",
                           bulk2type(req), deadline - start,
- -                        cfs_time_current_sec() - deadline);
+ +                        ktime_get_real_seconds() - deadline);
                 ptlrpc_abort_bulk(desc);
         } else if (exp->exp_failed) {
                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
diff --combined lustre/ldlm/ldlm_lock.c

index 472dde5,a61487a..b7ec3df
--- 1/lustre/ldlm/ldlm_lock.c
--- 2/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@@ -1072,16 -1072,14 +1072,14 @@@ static void ldlm_granted_list_add_lock(
    * Add a lock to granted list on a resource maintaining skiplist
    * correctness.
    */
- static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+ void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
   {
-         struct sl_insert_point prev;
-         ENTRY;
+       struct sl_insert_point prev;
   
-         LASSERT(lock->l_req_mode == lock->l_granted_mode);
+       LASSERT(lock->l_req_mode == lock->l_granted_mode);
   
-         search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
-         ldlm_granted_list_add_lock(lock, &prev);
-         EXIT;
+       search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
+       ldlm_granted_list_add_lock(lock, &prev);
   }
   
   /**
@@@ -2441,7 -2439,7 +2439,7 @@@ static void ldlm_cancel_lock_for_export
   
         res = ldlm_resource_getref(lock->l_resource);
   
-       ldlm_res_lvbo_update(res, NULL, 1);
+       ldlm_lvbo_update(res, lock, NULL, 1);
         ldlm_lock_cancel(lock);
         if (!exp->exp_obd->obd_stopping)
                 ldlm_reprocess_all(res);
@@@ -2774,7 -2772,7 +2772,7 @@@ void _ldlm_lock_debug(struct ldlm_lock 
                   libcfs_debug_vmsg2(msgdata, fmt, args,
                        " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                        "res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s "
- -                     "remote: %#llx expref: %d pid: %u timeout: %lu "
+ +                     "remote: %#llx expref: %d pid: %u timeout: %lld "
                        "lvb_type: %d\n",
                          lock,
                        lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
@@@ -2794,7 -2792,7 +2792,7 @@@
                         " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                         "res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] "
                         "(req %llu->%llu) flags: %#llx nid: %s remote: "
- -                      "%#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+ +                      "%#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
                         ldlm_lock_to_ns_name(lock), lock,
                         lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                         lock->l_readers, lock->l_writers,
@@@ -2817,7 -2815,7 +2815,7 @@@
                         " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                         "res: "DLDLMRES" rrc: %d type: %s pid: %d "
                         "[%llu->%llu] flags: %#llx nid: %s "
- -                      "remote: %#llx expref: %d pid: %u timeout: %lu\n",
+ +                      "remote: %#llx expref: %d pid: %u timeout: %lld\n",
                         ldlm_lock_to_ns_name(lock), lock,
                         lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                         lock->l_readers, lock->l_writers,
@@@ -2839,7 -2837,7 +2837,7 @@@
                         " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                         "res: "DLDLMRES" bits %#llx/%#llx rrc: %d type: %s "
                         "flags: %#llx nid: %s remote: %#llx expref: %d "
- -                      "pid: %u timeout: %lu lvb_type: %d\n",
+ +                      "pid: %u timeout: %lld lvb_type: %d\n",
                         ldlm_lock_to_ns_name(lock),
                         lock, lock->l_handle.h_cookie,
                         atomic_read(&lock->l_refc),
@@@ -2862,7 -2860,7 +2860,7 @@@
                         " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
                         "res: "DLDLMRES" rrc: %d type: %s flags: %#llx "
                         "nid: %s remote: %#llx expref: %d pid: %u "
- -                      "timeout: %lu lvb_type: %d\n",
+ +                      "timeout: %lld lvb_type: %d\n",
                         ldlm_lock_to_ns_name(lock),
                         lock, lock->l_handle.h_cookie,
                         atomic_read(&lock->l_refc),
diff --combined lustre/ldlm/ldlm_lockd.c

index 0584a0f,383b391..b96396a
--- 1/lustre/ldlm/ldlm_lockd.c
--- 2/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@@ -64,16 -64,18 +64,16 @@@ struct kset *ldlm_svc_kset
   
   static struct ldlm_state *ldlm_state;
   
- -static inline cfs_time_t round_timeout(cfs_time_t timeout)
- -{
- -        return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
- -}
- -
- -/* timeout for initial callback (AST) reply (bz10399) */
- -static inline unsigned int ldlm_get_rq_timeout(void)
+ +/* Timeout for the initial callback (AST) reply (bz10399).
+ + * The value is sent over the wire as a 32-bit time, so it is
+ + * returned as time_t rather than time64_t.
+ + */
+ +static inline time_t ldlm_get_rq_timeout(void)
   {
- -        /* Non-AT value */
- -        unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
+ +      /* Non-AT value */
+ +      time_t timeout = min(ldlm_timeout, obd_timeout / 3);
   
- -        return timeout < 1 ? 1 : timeout;
+ +      return timeout < 1 ? 1 : timeout;
   }
   
   struct ldlm_bl_pool {
@@@ -257,7 -259,7 +257,7 @@@ static int expired_lock_main(void *arg
   }
   
   static int ldlm_add_waiting_lock(struct ldlm_lock *lock);
- -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds);
+ +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds);
   
   /**
    * Check if there is a request in the export request list
@@@ -294,10 -296,11 +294,10 @@@ static void waiting_locks_callback(unsi
         spin_lock_bh(&waiting_locks_spinlock);
         while (!list_empty(&waiting_locks_list)) {
                 lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
- -                                      l_pending_chain);
- -                if (cfs_time_after(lock->l_callback_timeout,
- -                                   cfs_time_current()) ||
- -                    (lock->l_req_mode == LCK_GROUP))
- -                        break;
+ +                                l_pending_chain);
+ +              if (lock->l_callback_timeout > ktime_get_seconds() ||
+ +                  lock->l_req_mode == LCK_GROUP)
+ +                      break;
   
                   /* Check if we need to prolong timeout */
                   if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) &&
@@@ -345,18 -348,17 +345,18 @@@
                 wake_up(&expired_lock_wait_queue);
         }
   
- -        /*
- -         * Make sure the timer will fire again if we have any locks
- -         * left.
- -         */
+ +      /*
+ +       * Make sure the timer will fire again if we have any locks
+ +       * left.
+ +       */
         if (!list_empty(&waiting_locks_list)) {
- -                cfs_time_t timeout_rounded;
+ +              unsigned long timeout_jiffies;
+ +
                 lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
- -                                      l_pending_chain);
- -                timeout_rounded = (cfs_time_t)round_timeout(lock->l_callback_timeout);
- -              mod_timer(&waiting_locks_timer, timeout_rounded);
- -        }
+ +                                l_pending_chain);
+ +              /* l_callback_timeout is an absolute time in seconds (it is
+ +               * compared against ktime_get_seconds() above), whereas
+ +               * mod_timer() takes an absolute expiry in jiffies.  Arm the
+ +               * timer with the time remaining, clamped at zero so that an
+ +               * already expired entry fires on the next tick.
+ +               */
+ +              timeout_jiffies = jiffies +
+ +                      cfs_time_seconds(max_t(time64_t, 0,
+ +                                             lock->l_callback_timeout -
+ +                                             ktime_get_seconds()));
+ +              mod_timer(&waiting_locks_timer, timeout_jiffies);
+ +      }
         spin_unlock_bh(&waiting_locks_spinlock);
   }
   
@@@ -372,10 -374,10 +372,10 @@@
    *
    * Called with the namespace lock held.
    */
- -static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds)
+ +static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds)
   {
- -        cfs_time_t timeout;
- -        cfs_time_t timeout_rounded;
+ +      unsigned long timeout_jiffies;
+ +      time64_t timeout;
   
         if (!list_empty(&lock->l_pending_chain))
                   return 0;
@@@ -384,22 -386,21 +384,22 @@@
               OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
                   seconds = 1;
   
- -        timeout = cfs_time_shift(seconds);
- -        if (likely(cfs_time_after(timeout, lock->l_callback_timeout)))
+ +      timeout = ktime_get_seconds() + seconds;
+ +      if (likely(timeout > lock->l_callback_timeout))
                   lock->l_callback_timeout = timeout;
   
- -        timeout_rounded = round_timeout(lock->l_callback_timeout);
+ +      /* time_before() and mod_timer() below operate on absolute jiffies,
+ +       * but l_callback_timeout is an absolute time in seconds taken from
+ +       * ktime_get_seconds().  Convert the remaining time, not the
+ +       * timestamp itself, and never pass a negative delay.
+ +       */
+ +      timeout_jiffies = jiffies +
+ +              cfs_time_seconds(max_t(time64_t, 0,
+ +                                     lock->l_callback_timeout -
+ +                                     ktime_get_seconds()));
   
- -      if (cfs_time_before(timeout_rounded, waiting_locks_timer.expires) ||
- -          !timer_pending(&waiting_locks_timer)) {
- -              mod_timer(&waiting_locks_timer, timeout_rounded);
- -        }
- -        /* if the new lock has a shorter timeout than something earlier on
- -           the list, we'll wait the longer amount of time; no big deal. */
- -        /* FIFO */
+ +      if (time_before(timeout_jiffies, waiting_locks_timer.expires) ||
+ +          !timer_pending(&waiting_locks_timer))
+ +              mod_timer(&waiting_locks_timer, timeout_jiffies);
+ +
+ +      /* if the new lock has a shorter timeout than something earlier on
+ +       * the list, we'll wait the longer amount of time; no big deal.
+ +       */
+ +      /* FIFO */
         list_add_tail(&lock->l_pending_chain, &waiting_locks_list);
- -        return 1;
+ +      return 1;
   }
   
   static void ldlm_add_blocked_lock(struct ldlm_lock *lock)
@@@ -426,8 -427,8 +426,8 @@@
   
   static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
   {
+ +      time64_t timeout = ldlm_bl_timeout(lock);
         int ret;
- -      int timeout = ldlm_bl_timeout(lock);
   
         /* NB: must be called with hold of lock_res_and_lock() */
         LASSERT(ldlm_is_res_locked(lock));
@@@ -446,12 -447,12 +446,12 @@@
         }
   
         if (ldlm_is_destroyed(lock)) {
- -              static cfs_time_t next;
+ +              static time64_t next;
   
                 spin_unlock_bh(&waiting_locks_spinlock);
                 LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)");
- -              if (cfs_time_after(cfs_time_current(), next)) {
- -                      next = cfs_time_shift(14400);
+ +              if (ktime_get_seconds() > next) {
+ +                      next = ktime_get_seconds() + 14400;
                         libcfs_debug_dumpstack(NULL);
                 }
                 return 0;
@@@ -470,7 -471,7 +470,7 @@@
         if (ret)
                 ldlm_add_blocked_lock(lock);
   
- -      LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)",
+ +      LDLM_DEBUG(lock, "%sadding to wait list(timeout: %lld, AT: %s)",
                    ret == 0 ? "not re-" : "", timeout,
                    AT_OFF ? "off" : "on");
         return ret;
@@@ -500,11 -501,10 +500,11 @@@ static int __ldlm_del_waiting_lock(stru
                         del_timer(&waiting_locks_timer);
                   } else {
                           struct ldlm_lock *next;
+ +
                         next = list_entry(list_next, struct ldlm_lock,
- -                                              l_pending_chain);
+ +                                        l_pending_chain);
                         mod_timer(&waiting_locks_timer,
- -                                round_timeout(next->l_callback_timeout));
+ +                                /* expiry must be absolute jiffies, while
+ +                                 * l_callback_timeout is absolute seconds:
+ +                                 * re-arm with the remaining time
+ +                                 */
+ +                                jiffies +
+ +                                cfs_time_seconds(max_t(time64_t, 0,
+ +                                        next->l_callback_timeout -
+ +                                        ktime_get_seconds())));
                   }
           }
         list_del_init(&lock->l_pending_chain);
@@@ -547,7 -547,7 +547,7 @@@ int ldlm_del_waiting_lock(struct ldlm_l
    *
    * Called with namespace lock held.
    */
- -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+ +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout)
   {
         if (lock->l_export == NULL) {
                 /* We don't have a "waiting locks list" on clients. */
@@@ -587,7 -587,7 +587,7 @@@ int ldlm_del_waiting_lock(struct ldlm_l
           RETURN(0);
   }
   
- -int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+ +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout)
   {
           RETURN(0);
   }
@@@ -605,9 -605,9 +605,9 @@@
    *
    * \retval            timeout in seconds to wait for the client reply
    */
- -unsigned int ldlm_bl_timeout(struct ldlm_lock *lock)
+ +time64_t ldlm_bl_timeout(struct ldlm_lock *lock)
   {
- -      unsigned int timeout;
+ +      time64_t timeout;
   
         if (AT_OFF)
                 return obd_timeout / 2;
@@@ -617,7 -617,7 +617,7 @@@
          * It would be nice to have some kind of "early reply" mechanism for
          * lock callbacks too... */
         timeout = at_get(&lock->l_export->exp_bl_lock_at);
- -      return max(timeout + (timeout >> 1), ldlm_enqueue_min);
+ +      return max(timeout + (timeout >> 1), (time64_t)ldlm_enqueue_min);
   }
   EXPORT_SYMBOL(ldlm_bl_timeout);
   
@@@ -654,7 -654,14 +654,7 @@@ static int ldlm_handle_ast_error(struc
         struct lnet_process_id peer = req->rq_import->imp_connection->c_peer;
   
         if (!req->rq_replied || (rc && rc != -EINVAL)) {
- -              if (lock->l_export && lock->l_export->exp_libclient) {
- -                      LDLM_DEBUG(lock,
- -                                 "%s AST (req@%p x%llu) to liblustre client (nid %s) timeout, just cancelling lock",
- -                                 ast_type, req, req->rq_xid,
- -                                 libcfs_nid2str(peer.nid));
- -                      ldlm_lock_cancel(lock);
- -                      rc = -ERESTART;
- -              } else if (ldlm_is_cancel(lock)) {
+ +              if (ldlm_is_cancel(lock)) {
                         LDLM_DEBUG(lock,
                                    "%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)",
                                    ast_type, req, req->rq_xid,
@@@ -706,7 -713,7 +706,7 @@@
                         /* update lvbo to return proper attributes.
                          * see bug 23174 */
                         ldlm_resource_getref(res);
-                       ldlm_res_lvbo_update(res, NULL, 1);
+                       ldlm_lvbo_update(res, lock, NULL, 1);
                         ldlm_resource_putref(res);
                 }
                 ldlm_lock_cancel(lock);
@@@ -741,11 -748,11 +741,11 @@@ static int ldlm_cb_interpret(const stru
                 } else if (rc == -ELDLM_NO_LOCK_DATA) {
                         LDLM_DEBUG(lock, "lost race - client has a lock but no "
                                    "inode");
-                       ldlm_res_lvbo_update(lock->l_resource, NULL, 1);
+                       ldlm_lvbo_update(lock->l_resource, lock, NULL, 1);
                 } else if (rc != 0) {
                         rc = ldlm_handle_ast_error(lock, req, rc, "glimpse");
                 } else {
-                       rc = ldlm_res_lvbo_update(lock->l_resource, req, 1);
+                       rc = ldlm_lvbo_update(lock->l_resource, lock, req, 1);
                 }
                 break;
         case LDLM_BL_CALLBACK:
@@@ -773,8 -780,8 +773,8 @@@
   
   static void ldlm_update_resend(struct ptlrpc_request *req, void *data)
   {
- -      struct ldlm_cb_async_args *ca   = data;
- -      struct ldlm_lock          *lock = ca->ca_lock;
+ +      struct ldlm_cb_async_args *ca = data;
+ +      struct ldlm_lock *lock = ca->ca_lock;
   
         ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock));
   }
@@@ -867,18 -874,18 +867,18 @@@ int ldlm_server_blocking_ast(struct ldl
   
           ldlm_lock_reorder_req(lock);
   
- -        req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse,
- -                                        &RQF_LDLM_BL_CALLBACK,
- -                                        LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK);
- -        if (req == NULL)
- -                RETURN(-ENOMEM);
+ +      req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse,
+ +                                      &RQF_LDLM_BL_CALLBACK,
+ +                                      LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK);
+ +      if (req == NULL)
+ +              RETURN(-ENOMEM);
   
- -        CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
- -        ca = ptlrpc_req_async_args(req);
- -        ca->ca_set_arg = arg;
- -        ca->ca_lock = lock;
+ +      CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
+ +      ca = ptlrpc_req_async_args(req);
+ +      ca->ca_set_arg = arg;
+ +      ca->ca_lock = lock;
   
- -        req->rq_interpret_reply = ldlm_cb_interpret;
+ +      req->rq_interpret_reply = ldlm_cb_interpret;
   
         lock_res_and_lock(lock);
         if (ldlm_is_destroyed(lock)) {
@@@ -985,21 -992,21 +985,21 @@@ int ldlm_server_completion_ast(struct l
                 lvb_len = 0;
   
         req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len);
- -        rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK);
- -        if (rc) {
- -                ptlrpc_request_free(req);
- -                RETURN(rc);
- -        }
+ +      rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK);
+ +      if (rc) {
+ +              ptlrpc_request_free(req);
+ +              RETURN(rc);
+ +      }
   
- -        CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
- -        ca = ptlrpc_req_async_args(req);
- -        ca->ca_set_arg = arg;
- -        ca->ca_lock = lock;
+ +      CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
+ +      ca = ptlrpc_req_async_args(req);
+ +      ca->ca_set_arg = arg;
+ +      ca->ca_lock = lock;
   
- -        req->rq_interpret_reply = ldlm_cb_interpret;
- -        body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ +      req->rq_interpret_reply = ldlm_cb_interpret;
+ +      body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
   
- -        body->lock_handle[0] = lock->l_remote_handle;
+ +      body->lock_handle[0] = lock->l_remote_handle;
         body->lock_flags = ldlm_flags_to_wire(flags);
           ldlm_lock2desc(lock, &body->lock_desc);
         if (lvb_len > 0) {
@@@ -1113,9 -1120,9 +1113,9 @@@ int ldlm_server_glimpse_ast(struct ldlm
                 *desc = *arg->gl_desc;
         }
   
- -        body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
- -        body->lock_handle[0] = lock->l_remote_handle;
- -        ldlm_lock2desc(lock, &body->lock_desc);
+ +      body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ +      body->lock_handle[0] = lock->l_remote_handle;
+ +      ldlm_lock2desc(lock, &body->lock_desc);
   
         CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
         ca = ptlrpc_req_async_args(req);
@@@ -1145,6 -1152,7 +1145,7 @@@
   
         RETURN(rc);
   }
+ EXPORT_SYMBOL(ldlm_server_glimpse_ast);
   
   int ldlm_glimpse_locks(struct ldlm_resource *res,
                        struct list_head *gl_work_list)
@@@ -1177,40 -1185,6 +1178,6 @@@ struct ldlm_lock *ldlm_request_lock(str
   }
   EXPORT_SYMBOL(ldlm_request_lock);
   
- static void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
-                        struct lprocfs_stats *srv_stats)
- {
-         int lock_type = 0, op = 0;
- 
-         lock_type = dlm_req->lock_desc.l_resource.lr_type;
- 
-         switch (lock_type) {
-         case LDLM_PLAIN:
-                 op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE;
-                 break;
-         case LDLM_EXTENT:
-                 if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT)
-                         op = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE;
-                 else
-                         op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE;
-                 break;
-         case LDLM_FLOCK:
-                 op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE;
-                 break;
-         case LDLM_IBITS:
-                 op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE;
-                 break;
-         default:
-                 op = 0;
-                 break;
-         }
- 
-         if (op)
-                 lprocfs_counter_incr(srv_stats, op);
- 
-         return;
- }
- 
   /**
    * Main server-side entry point into LDLM for enqueue. This is called by ptlrpc
    * service threads to carry out client lock enqueueing requests.
@@@ -1236,7 -1210,9 +1203,9 @@@ int ldlm_handle_enqueue0(struct ldlm_na
   
         LASSERT(req->rq_export);
   
-       if (ptlrpc_req2svc(req)->srv_stats != NULL)
+       /* for intent enqueue the stat will be updated inside intent policy */
+       if (ptlrpc_req2svc(req)->srv_stats != NULL &&
+           !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT))
                 ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats);
   
           if (req->rq_export && req->rq_export->exp_nid_stats &&
@@@ -1356,7 -1332,6 +1325,6 @@@
                 lock->l_req_extent = lock->l_policy_data.l_extent;
   
   existing_lock:
- 
           if (flags & LDLM_FL_HAS_INTENT) {
                   /* In this case, the reply buffer is allocated deep in
                    * local_lock_enqueue by the policy function. */
@@@ -1425,9 -1400,34 +1393,9 @@@
                                   ldlm_add_waiting_lock(lock);
                   }
           }
- -        /* Make sure we never ever grant usual metadata locks to liblustre
- -           clients */
- -        if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN ||
- -            dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) &&
- -             req->rq_export->exp_libclient) {
- -              if (unlikely(!ldlm_is_cancel_on_block(lock) ||
- -                             !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK))){
- -                        CERROR("Granting sync lock to libclient. "
- -                             "req fl %d, rep fl %d, lock fl %#llx\n",
- -                               dlm_req->lock_flags, dlm_rep->lock_flags,
- -                               lock->l_flags);
- -                        LDLM_ERROR(lock, "sync lock");
- -                      if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) {
- -                              struct ldlm_intent *it;
- -
- -                              it = req_capsule_client_get(&req->rq_pill,
- -                                                          &RMF_LDLM_INTENT);
- -                              if (it != NULL) {
- -                                      CERROR("This is intent %s (%llu)\n",
- -                                             ldlm_it2str(it->opc), it->opc);
- -                              }
- -                      }
- -                }
- -        }
+ +      unlock_res_and_lock(lock);
   
- -        unlock_res_and_lock(lock);
- -
- -        EXIT;
+ +      EXIT;
    out:
           req->rq_status = rc ?: err; /* return either error - bug 11190 */
           if (!req->rq_packed_final) {
@@@ -1658,7 -1658,9 +1626,9 @@@ int ldlm_request_cancel(struct ptlrpc_r
                           if (res != NULL) {
                                   ldlm_resource_getref(res);
                                   LDLM_RESOURCE_ADDREF(res);
-                                 ldlm_res_lvbo_update(res, NULL, 1);
+ 
+                               if (!ldlm_is_discard_data(lock))
+                                       ldlm_lvbo_update(res, lock, NULL, 1);
                           }
                           pres = res;
                   }
@@@ -1773,8 -1775,7 +1743,8 @@@ static void ldlm_handle_cp_callback(str
   
         INIT_LIST_HEAD(&ast_list);
         if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
- -              int to = cfs_time_seconds(1);
+ +              long to = cfs_time_seconds(1);
+ +
                 while (to > 0) {
                         set_current_state(TASK_INTERRUPTIBLE);
                         schedule_timeout(to);
@@@ -2945,7 -2946,7 +2915,7 @@@ static int ldlm_setup(void
           if (ldlm_state == NULL)
                   RETURN(-ENOMEM);
   
- -      ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj);
+ +      ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj);
         if (!ldlm_kobj)
                 GOTO(out, -ENOMEM);
   
@@@ -3153,10 -3154,8 +3123,10 @@@ static int ldlm_cleanup(void
                 kset_unregister(ldlm_ns_kset);
         if (ldlm_svc_kset)
                 kset_unregister(ldlm_svc_kset);
- -      if (ldlm_kobj)
+ +      if (ldlm_kobj) {
+ +              sysfs_remove_group(ldlm_kobj, &ldlm_attr_group);
                 kobject_put(ldlm_kobj);
+ +      }
   
         ldlm_proc_cleanup();
   
@@@ -3201,11 -3200,22 +3171,22 @@@ int ldlm_init(void
         if (ldlm_interval_tree_slab == NULL)
                 goto out_interval;
   
+ #ifdef HAVE_SERVER_SUPPORT
+       ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
+                                       sizeof(struct ldlm_glimpse_work),
+                                       0, 0, NULL);
+       if (ldlm_glimpse_work_kmem == NULL)
+               goto out_interval_tree;
+ #endif
+ 
   #if LUSTRE_TRACKS_LOCK_EXP_REFS
         class_export_dump_hook = ldlm_dump_export_locks;
   #endif
         return 0;
- 
+ #ifdef HAVE_SERVER_SUPPORT
+ out_interval_tree:
+       kmem_cache_destroy(ldlm_interval_tree_slab);
+ #endif
   out_interval:
         kmem_cache_destroy(ldlm_interval_slab);
   out_lock:
@@@ -3228,4 -3238,7 +3209,7 @@@ void ldlm_exit(void
         kmem_cache_destroy(ldlm_lock_slab);
         kmem_cache_destroy(ldlm_interval_slab);
         kmem_cache_destroy(ldlm_interval_tree_slab);
+ #ifdef HAVE_SERVER_SUPPORT
+       kmem_cache_destroy(ldlm_glimpse_work_kmem);
+ #endif
   }
diff --combined lustre/ldlm/ldlm_request.c

index 43da254,72f0049..859568d
--- 1/lustre/ldlm/ldlm_request.c
--- 2/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@@ -120,16 -120,16 +120,16 @@@ int ldlm_expired_completion_wait(void *
   
           ENTRY;
           if (lock->l_conn_export == NULL) {
- -                static cfs_time_t next_dump = 0, last_dump = 0;
+ +              static time64_t next_dump, last_dump;
   
                 LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); "
                            "not entering recovery in server code, just going back to sleep",
                            (s64)lock->l_last_activity,
                            (s64)(ktime_get_real_seconds() -
                                  lock->l_last_activity));
- -                if (cfs_time_after(cfs_time_current(), next_dump)) {
+ +              if (ktime_get_seconds() > next_dump) {
                           last_dump = next_dump;
- -                        next_dump = cfs_time_shift(300);
+ +                      next_dump = ktime_get_seconds() + 300;
                           ldlm_namespace_dump(D_DLMTRACE,
                                               ldlm_lock_to_ns(lock));
                           if (last_dump == 0)
@@@ -161,9 -161,9 +161,9 @@@
   
   /* We use the same basis for both server side and client side functions
      from a single node. */
- -static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock)
+ +static time64_t ldlm_cp_timeout(struct ldlm_lock *lock)
   {
- -      unsigned int timeout;
+ +      time64_t timeout;
   
         if (AT_OFF)
                 return obd_timeout;
@@@ -172,7 -172,7 +172,7 @@@
          * lock from another client.  Server will evict the other client if it
          * doesn't respond reasonably, and then give us the lock. */
         timeout = at_get(ldlm_lock_to_ns_at(lock));
- -      return max(3 * timeout, ldlm_enqueue_min);
+ +      return max(3 * timeout, (time64_t) ldlm_enqueue_min);
   }
   
   /**
@@@ -255,7 -255,7 +255,7 @@@ int ldlm_completion_ast(struct ldlm_loc
           struct obd_device *obd;
           struct obd_import *imp = NULL;
           struct l_wait_info lwi;
- -        __u32 timeout;
+ +      time64_t timeout;
           int rc = 0;
           ENTRY;
   
@@@ -284,7 -284,7 +284,7 @@@ noreproc
         timeout = ldlm_cp_timeout(lock);
   
         lwd.lwd_lock = lock;
- -      lock->l_last_activity = cfs_time_current_sec();
+ +      lock->l_last_activity = ktime_get_real_seconds();
   
         if (ldlm_is_no_timeout(lock)) {
                   LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
@@@ -946,7 -946,7 +946,7 @@@ int ldlm_cli_enqueue(struct obd_export 
         lock->l_export = NULL;
         lock->l_blocking_ast = einfo->ei_cb_bl;
         lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL));
- -        lock->l_last_activity = cfs_time_current_sec();
+ +      lock->l_last_activity = ktime_get_real_seconds();
   
         /* lock not sent to server yet */
         if (reqp == NULL || *reqp == NULL) {
@@@ -976,6 -976,28 +976,28 @@@
         body->lock_flags = ldlm_flags_to_wire(*flags);
         body->lock_handle[0] = *lockh;
   
+       /* extended LDLM opcodes in client stats */
+       if (exp->exp_obd->obd_svc_stats != NULL) {
+               bool glimpse = *flags & LDLM_FL_HAS_INTENT;
+ 
+               /* OST glimpse has no intent buffer */
+               if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT,
+                                         RCL_CLIENT)) {
+                       struct ldlm_intent *it;
+ 
+                       it = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_LDLM_INTENT);
+                       glimpse = (it && (it->opc == IT_GLIMPSE));
+               }
+ 
+               if (!glimpse)
+                       ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats);
+               else
+                       lprocfs_counter_incr(exp->exp_obd->obd_svc_stats,
+                                            PTLRPC_LAST_CNTR +
+                                            LDLM_GLIMPSE_ENQUEUE);
+       }
+ 
         if (async) {
                 LASSERT(reqp != NULL);
                 RETURN(0);
@@@ -1817,8 -1839,8 +1839,8 @@@ static int ldlm_prepare_lru_list(struc
                 lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
   
                 if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) &&
-                   lock->l_resource->lr_type == LDLM_EXTENT &&
-                   lock->l_granted_mode == LCK_PR)
+                   (lock->l_resource->lr_type == LDLM_EXTENT ||
+                    ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR)
                         ldlm_set_discard_data(lock);
   
                 /* We can't re-add to l_lru as it confuses the
diff --combined lustre/llite/llite_lib.c

index 6071d49,960f90c..002c9a7
--- 1/lustre/llite/llite_lib.c
--- 2/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@@ -99,7 -99,8 +99,7 @@@ static struct ll_sb_info *ll_init_sbi(v
         sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
                                            SBI_DEFAULT_READAHEAD_MAX);
         sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
- -      sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
- -                                         SBI_DEFAULT_READAHEAD_WHOLE_MAX;
+ +      sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1;
   
           ll_generate_random_uuid(uuid);
           class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
@@@ -195,13 -196,18 +195,18 @@@ static int client_common_fill_super(str
                   RETURN(-ENOMEM);
           }
   
+       /* pass client page size via ocd_grant_blkbits, the server should report
+        * back its backend blocksize for grant calculation purpose */
+       data->ocd_grant_blkbits = PAGE_SHIFT;
+ 
         /* indicate MDT features supported by this client */
-         data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
-                                   OBD_CONNECT_ATTRFID  |
-                                   OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
-                                   OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
-                                   OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
-                                   OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
+       data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
+                                 OBD_CONNECT_ATTRFID  | OBD_CONNECT_GRANT |
+                                 OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
+                                 OBD_CONNECT_SRVLOCK  | OBD_CONNECT_TRUNCLOCK|
+                                 OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+                                 OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
+                                 OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
                                   OBD_CONNECT_VBR | OBD_CONNECT_FULL20 |
                                   OBD_CONNECT_64BITHASH |
                                   OBD_CONNECT_EINPROGRESS |
@@@ -212,9 -218,10 +217,10 @@@
                                   OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK |
                                   OBD_CONNECT_OPEN_BY_FID |
                                   OBD_CONNECT_DIR_STRIPE |
-                                 OBD_CONNECT_BULK_MBITS |
+                                 OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM |
                                   OBD_CONNECT_SUBTREE |
-                                 OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS;
+                                 OBD_CONNECT_MULTIMODRPCS |
+                                 OBD_CONNECT_GRANT_PARAM | OBD_CONNECT_FLAGS2;
   
         data->ocd_connect_flags2 = 0;
   
@@@ -227,6 -234,8 +233,8 @@@
                                    OBD_CONNECT_LARGE_ACL;
   #endif
   
+       data->ocd_cksum_types = cksum_types_supported_client();
+ 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
                 /* flag mdc connection as lightweight, only used for test
                  * purpose, use with care */
@@@ -258,7 -267,7 +266,7 @@@
         if (sbi->ll_flags & LL_SBI_ALWAYS_PING)
                 data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
   
- -#ifdef HAVE_SECURITY_DENTRY_INIT_SECURITY
+ +#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY)
         data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX;
   #endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */
   
@@@ -399,7 -408,7 +407,7 @@@
                                   OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
                                   OBD_CONNECT_LAYOUTLOCK |
                                   OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK |
- -                                OBD_CONNECT_BULK_MBITS |
+ +                                OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO |
                                   OBD_CONNECT_FLAGS2;
   
   /* The client currently advertises support for OBD_CONNECT_LOCKAHEAD_OLD so it
@@@ -466,12 -475,6 +474,12 @@@
   
         sbi->ll_dt_exp->exp_connect_data = *data;
   
+ +      /* Don't change value if it was specified in the config log */
+ +      if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1)
+ +              sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+ +                      max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX,
+ +                            (data->ocd_brw_size >> PAGE_SHIFT));
+ +
         err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
                            LUSTRE_SEQ_METADATA);
         if (err) {
@@@ -948,41 -951,14 +956,41 @@@ void ll_lli_init(struct ll_inode_info *
         memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE);
   }
   
- -static inline int ll_bdi_register(struct backing_dev_info *bdi)
+ +#ifndef HAVE_SUPER_SETUP_BDI_NAME
+ +
+ +#define LSI_BDI_INITIALIZED   0x00400000
+ +
+ +#ifndef HAVE_BDI_CAP_MAP_COPY
+ +# define BDI_CAP_MAP_COPY     0
+ +#endif
+ +
+ +#define MAX_STRING_SIZE 128
+ +
+ +static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
   {
- -      static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+ +      struct  lustre_sb_info *lsi = s2lsi(sb);
+ +      char buf[MAX_STRING_SIZE];
+ +      va_list args;
+ +      int err;
+ +
+ +      err = bdi_init(&lsi->lsi_bdi);
+ +      if (err)
+ +              return err;
+ +
+ +      lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+ +      lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+ +      lsi->lsi_bdi.name = "lustre";
+ +      va_start(args, fmt);
+ +      vsnprintf(buf, MAX_STRING_SIZE, fmt, args);
+ +      va_end(args);
+ +      err = bdi_register(&lsi->lsi_bdi, NULL, "%s", buf);
+ +      va_end(args);
+ +      if (!err)
+ +              sb->s_bdi = &lsi->lsi_bdi;
   
- -      bdi->name = "lustre";
- -      return bdi_register(bdi, NULL, "lustre-%d",
- -                          atomic_inc_return(&ll_bdi_num));
+ +      return err;
   }
+ +#endif /* !HAVE_SUPER_SETUP_BDI_NAME */
   
   int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
   {
@@@ -1019,10 -995,20 +1027,10 @@@
         if (err)
                 GOTO(out_free, err);
   
- -      err = bdi_init(&lsi->lsi_bdi);
- -      if (err)
- -              GOTO(out_free, err);
- -      lsi->lsi_flags |= LSI_BDI_INITIALIZED;
- -#ifdef HAVE_BDI_CAP_MAP_COPY
- -      lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
- -#else
- -      lsi->lsi_bdi.capabilities = 0;
- -#endif
- -      err = ll_bdi_register(&lsi->lsi_bdi);
+ +      err = super_setup_bdi_name(sb, "lustre-%p", sb);
         if (err)
                 GOTO(out_free, err);
   
- -      sb->s_bdi = &lsi->lsi_bdi;
   #ifndef HAVE_DCACHE_LOCK
         /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
         sb->s_d_op = &ll_d_ops;
@@@ -1164,12 -1150,10 +1172,12 @@@ void ll_put_super(struct super_block *s
           if (profilenm)
                   class_del_profile(profilenm);
   
+ +#ifndef HAVE_SUPER_SETUP_BDI_NAME
         if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
                 bdi_destroy(&lsi->lsi_bdi);
                 lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
         }
+ +#endif
   
           ll_free_sbi(sb);
           lsi->lsi_llsbi = NULL;
diff --combined lustre/llite/namei.c

index ef18c1d,f4f8911..3920026
--- 1/lustre/llite/namei.c
--- 2/lustre/llite/namei.c
+++ b/lustre/llite/namei.c
@@@ -184,6 -184,45 +184,45 @@@ int ll_test_inode_by_fid(struct inode *
         return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque);
   }
   
+ int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
+ {
+       struct lu_env *env;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct cl_layout clt = { .cl_layout_gen = 0, };
+       int rc;
+       __u16 refcheck;
+ 
+ 
+       ENTRY;
+ 
+       if (!lli->lli_clob)
+               RETURN(0);
+ 
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+ 
+       rc = cl_object_layout_get(env, lli->lli_clob, &clt);
+       if (rc) {
+               CDEBUG(D_INODE, "Cannot get layout for "DFID"\n",
+                      PFID(ll_inode2fid(inode)));
+               rc = -ENODATA;
+       } else if (clt.cl_size == 0 || clt.cl_dom_comp_size == 0) {
+               CDEBUG(D_INODE, "DOM lock without DOM layout for "DFID"\n",
+                      PFID(ll_inode2fid(inode)));
+       } else {
+               enum cl_fsync_mode mode;
+               loff_t end = clt.cl_dom_comp_size - 1;
+ 
+               mode = ldlm_is_discard_data(lock) ?
+                                       CL_FSYNC_DISCARD : CL_FSYNC_LOCAL;
+               rc = cl_sync_file_range(inode, 0, end, mode, 1);
+               truncate_inode_pages_range(inode->i_mapping, 0, end);
+       }
+       cl_env_put(env, &refcheck);
+       RETURN(rc);
+ }
+ 
   int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                        void *data, int flag)
   {
@@@ -204,10 -243,6 +243,6 @@@
                 struct inode *inode = ll_inode_from_resource_lock(lock);
                 __u64 bits = lock->l_policy_data.l_inodebits.bits;
   
-               /* Inode is set to lock->l_resource->lr_lvb_inode
-                * for mdc - bug 24555 */
-               LASSERT(lock->l_ast_data == NULL);
- 
                 if (inode == NULL)
                         break;
   
@@@ -257,9 -292,22 +292,22 @@@
                 }
   
                 if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
-                           MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+                           MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM |
+                           MDS_INODELOCK_DOM))
                         ll_have_md_lock(inode, &bits, LCK_MINMODE);
   
+               if (bits & MDS_INODELOCK_DOM) {
+                       rc =  ll_dom_lock_cancel(inode, lock);
+                       if (rc < 0)
+                               CDEBUG(D_INODE, "cannot flush DoM data "
+                                      DFID": rc = %d\n",
+                                      PFID(ll_inode2fid(inode)), rc);
+                       lock_res_and_lock(lock);
+                       ldlm_set_kms_ignore(lock);
+                       unlock_res_and_lock(lock);
+                       bits &= ~MDS_INODELOCK_DOM;
+               }
+ 
                 if (bits & MDS_INODELOCK_LAYOUT) {
                         struct cl_object_conf conf = {
                                 .coc_opc = OBJECT_CONF_INVALIDATE,
@@@ -1386,18 -1434,17 +1434,18 @@@ int ll_rmdir_entry(struct inode *dir, c
   static int ll_unlink(struct inode *dir, struct dentry *dchild)
   {
         struct qstr *name = &dchild->d_name;
- -        struct ptlrpc_request *request = NULL;
- -        struct md_op_data *op_data;
- -        int rc;
- -        ENTRY;
+ +      struct ptlrpc_request *request = NULL;
+ +      struct md_op_data *op_data;
+ +      struct mdt_body *body;
+ +      int rc;
+ +      ENTRY;
         CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n",
                name->len, name->name, PFID(ll_inode2fid(dir)), dir);
   
- -        /*
- -         * XXX: unlink bind mountpoint maybe call to here,
- -         * just check it as vfs_unlink does.
- -         */
+ +      /*
+ +       * XXX: unlink bind mountpoint maybe call to here,
+ +       * just check it as vfs_unlink does.
+ +       */
         if (unlikely(d_mountpoint(dchild)))
                 RETURN(-EBUSY);
   
@@@ -1406,7 -1453,8 +1454,7 @@@
         if (IS_ERR(op_data))
                 RETURN(PTR_ERR(op_data));
   
- -      if (dchild->d_inode != NULL)
- -              op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
+ +      op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
   
         op_data->op_fid2 = op_data->op_fid3;
         rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
@@@ -1414,20 -1462,12 +1462,20 @@@
         if (rc)
                 GOTO(out, rc);
   
- -        ll_update_times(request, dir);
- -        ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+ +      /*
+ +       * The server puts attributes in on the last unlink, use them to update
+ +       * the link count so the inode can be freed immediately.
+ +       */
+ +      body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+ +      if (body->mbo_valid & OBD_MD_FLNLINK)
+ +              set_nlink(dchild->d_inode, body->mbo_nlink);
   
- - out:
- -        ptlrpc_req_finished(request);
- -        RETURN(rc);
+ +      ll_update_times(request, dir);
+ +      ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+ +
+ +out:
+ +      ptlrpc_req_finished(request);
+ +      RETURN(rc);
   }
   
   static int ll_rename(struct inode *src, struct dentry *src_dchild,
diff --combined lustre/lmv/lmv_obd.c

index 754c96f,1d7dad7..ea8950d
--- 1/lustre/lmv/lmv_obd.c
--- 2/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@@ -418,7 -418,7 +418,7 @@@ static int lmv_add_target(struct obd_de
         mutex_lock(&lmv->lmv_init_mutex);
         if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
                 tgt = lmv->tgts[index];
-               CERROR("%s: UUID %s already assigned at LOV target index %d:"
+               CERROR("%s: UUID %s already assigned at LMV target index %d:"
                        " rc = %d\n", obd->obd_name,
                        obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
                 mutex_unlock(&lmv->lmv_init_mutex);
@@@ -821,25 -821,11 +821,25 @@@ static int lmv_hsm_ct_register(struct l
   {
         struct file             *filp;
         __u32                    i, j;
- -      int                      err, rc;
+ +      int                      err;
         bool                     any_set = false;
- -      struct kkuc_ct_data      kcd = { 0 };
+ +      struct kkuc_ct_data      kcd = {
+ +              .kcd_magic   = KKUC_CT_DATA_MAGIC,
+ +              .kcd_uuid    = lmv->cluuid,
+ +              .kcd_archive = lk->lk_data
+ +      };
+ +      int                      rc = 0;
         ENTRY;
   
+ +      filp = fget(lk->lk_wfd);
+ +      if (!filp)
+ +              RETURN(-EBADF);
+ +
+ +      rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group,
+ +                                 &kcd, sizeof(kcd));
+ +      if (rc)
+ +              GOTO(err_fput, rc);
+ +
         /* All or nothing: try to register to all MDS.
          * In case of failure, unregister from previous MDS,
          * except if it because of inactive target. */
@@@ -848,7 -834,6 +848,7 @@@
   
                 if (tgt == NULL || tgt->ltd_exp == NULL)
                         continue;
+ +
                 err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg);
                 if (err) {
                         if (tgt->ltd_active) {
@@@ -867,7 -852,7 +867,7 @@@
                                         obd_iocontrol(cmd, tgt->ltd_exp, len,
                                                       lk, uarg);
                                 }
- -                              RETURN(rc);
+ +                              GOTO(err_kkuc_rem, rc);
                         }
                         /* else: transient error.
                          * kuc will register to the missing MDT
@@@ -879,16 -864,23 +879,16 @@@
   
         if (!any_set)
                 /* no registration done: return error */
- -              RETURN(-ENOTCONN);
+ +              GOTO(err_kkuc_rem, rc = -ENOTCONN);
   
- -      /* at least one registration done, with no failure */
- -      filp = fget(lk->lk_wfd);
- -      if (filp == NULL)
- -              RETURN(-EBADF);
+ +      RETURN(0);
   
- -      kcd.kcd_magic = KKUC_CT_DATA_MAGIC;
- -      kcd.kcd_uuid = lmv->cluuid;
- -      kcd.kcd_archive = lk->lk_data;
+ +err_kkuc_rem:
+ +      libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
   
- -      rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group,
- -                                 &kcd, sizeof(kcd));
- -      if (rc != 0)
- -              fput(filp);
- -
- -      RETURN(rc);
+ +err_fput:
+ +      fput(filp);
+ +      return rc;
   }
   
   
@@@ -1975,7 -1967,7 +1975,7 @@@ static int lmv_rename(struct obd_expor
                         RETURN(rc);
   
                 rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx,
-                                     LCK_EX, MDS_INODELOCK_FULL,
+                                     LCK_EX, MDS_INODELOCK_ELC,
                                       MF_MDC_CANCEL_FID3);
                 if (rc != 0)
                         RETURN(rc);
@@@ -1989,7 -1981,7 +1989,7 @@@ retry_rename
                 struct lmv_tgt_desc *tgt;
   
                 rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx,
-                                     LCK_EX, MDS_INODELOCK_FULL,
+                                     LCK_EX, MDS_INODELOCK_ELC,
                                       MF_MDC_CANCEL_FID4);
                 if (rc != 0)
                         RETURN(rc);
@@@ -2532,7 -2524,7 +2532,7 @@@ try_next_stripe
         }
   
         rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX,
-                             MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+                             MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3);
         if (rc != 0)
                 RETURN(rc);
   
diff --combined lustre/lod/lod_lov.c

index ff2eda4,663cbb2..f5df58d
--- 1/lustre/lod/lod_lov.c
--- 2/lustre/lod/lod_lov.c
+++ b/lustre/lod/lod_lov.c
@@@ -1257,7 -1257,8 +1257,8 @@@ int lod_parse_striping(const struct lu_
                 }
   
                 pattern = le32_to_cpu(lmm->lmm_pattern);
-               if (lov_pattern(pattern) != LOV_PATTERN_RAID0)
+               if (lov_pattern(pattern) != LOV_PATTERN_RAID0 &&
+                   lov_pattern(pattern) != LOV_PATTERN_MDT)
                         GOTO(out, rc = -EINVAL);
   
                 lod_comp->llc_pattern = pattern;
@@@ -1318,7 -1319,8 +1319,8 @@@
                 if (!lod_comp_inited(lod_comp))
                         continue;
   
-               if (!(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)) {
+               if (!(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) &&
+                   !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) {
                         rc = lod_initialize_objects(env, lo, objs, i);
                         if (rc)
                                 GOTO(out, rc);
@@@ -1511,7 -1513,7 +1513,7 @@@ static int lod_verify_v1v3(struct lod_d
                 GOTO(out, rc = -EINVAL);
         }
   
- -      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEF;
+ +      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED;
         if (magic != LOV_USER_MAGIC_V1 &&
             magic != LOV_USER_MAGIC_V3 &&
             magic != LOV_USER_MAGIC_SPECIFIC) {
@@@ -1545,7 -1547,8 +1547,8 @@@
         }
   
         stripe_offset = le16_to_cpu(lum->lmm_stripe_offset);
-       if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT) {
+       if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT &&
+           lov_pattern(le32_to_cpu(lum->lmm_pattern)) != LOV_PATTERN_MDT) {
                 /* if offset is not within valid range [0, osts_size) */
                 if (stripe_offset >= d->lod_osts_size) {
                         CDEBUG(D_LAYOUT, "stripe offset %u >= bitmap size %u\n",
@@@ -1637,7 -1640,7 +1640,7 @@@ int lod_verify_striping(struct lod_devi
                 RETURN(-EINVAL);
         }
   
- -      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEF;
+ +      magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED;
         if (magic != LOV_USER_MAGIC_V1 &&
             magic != LOV_USER_MAGIC_V3 &&
             magic != LOV_USER_MAGIC_SPECIFIC &&
@@@ -1701,6 -1704,38 +1704,38 @@@
                         tmp.lb_buf = (char *)comp_v1 +
                                      le32_to_cpu(ent->lcme_offset);
                         tmp.lb_len = le32_to_cpu(ent->lcme_size);
+ 
+                       /* Checks for DoM entry in composite layout. */
+                       lum = tmp.lb_buf;
+                       if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) ==
+                           LOV_PATTERN_MDT) {
+                               /* DoM component can be only the first entry */
+                               if (i > 0) {
+                                       CDEBUG(D_LAYOUT, "invalid DoM layout "
+                                              "entry found at %i index\n", i);
+                                       RETURN(-EINVAL);
+                               }
+                               stripe_size = le32_to_cpu(lum->lmm_stripe_size);
+                               /* There is just one stripe on MDT and it must
+                                * cover whole component size. */
+                               if (stripe_size != prev_end) {
+                                       CDEBUG(D_LAYOUT, "invalid DoM layout "
+                                              "stripe size %u != %llu "
+                                              "(component size)\n",
+                                              stripe_size, prev_end);
+                                       RETURN(-EINVAL);
+                               }
+                               /* Check stripe size against per-MDT limit */
+                               if (stripe_size > d->lod_dom_max_stripesize) {
+                                       CDEBUG(D_LAYOUT, "DoM component size "
+                                              "%u is bigger than MDT limit "
+                                              "%u, check dom_max_stripesize"
+                                              " parameter\n",
+                                              stripe_size,
+                                              d->lod_dom_max_stripesize);
+                                       RETURN(-EINVAL);
+                               }
+                       }
                         rc = lod_verify_v1v3(d, &tmp, is_from_disk);
                         if (rc)
                                 break;
@@@ -1779,7 -1814,8 +1814,8 @@@ void lod_fix_desc_stripe_count(__u32 *v
   void lod_fix_desc_pattern(__u32 *val)
   {
         /* from lov_setstripe */
-       if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) {
+       if ((*val != 0) && (*val != LOV_PATTERN_RAID0) &&
+           (*val != LOV_PATTERN_MDT)) {
                 LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
                 *val = 0;
         }
diff --combined lustre/lod/lod_object.c

index 330dd31,1254f33..6fbedf0
--- 1/lustre/lod/lod_object.c
--- 2/lustre/lod/lod_object.c
+++ b/lustre/lod/lod_object.c
@@@ -3634,6 -3634,7 +3634,7 @@@ static int lod_get_default_lov_striping
                 }
   
                 if (v1->lmm_pattern != LOV_PATTERN_RAID0 &&
+                   v1->lmm_pattern != LOV_PATTERN_MDT &&
                     v1->lmm_pattern != 0) {
                         lod_free_def_comp_entries(lds);
                         RETURN(-EINVAL);
@@@ -3648,6 -3649,7 +3649,7 @@@
                 lod_comp->llc_stripe_count = v1->lmm_stripe_count;
                 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
                 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
+               lod_comp->llc_pattern = v1->lmm_pattern;
   
                 pool = NULL;
                 if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
@@@ -3758,10 -3760,11 +3760,11 @@@ static void lod_striping_from_default(s
                                                 &lds->lds_def_comp_entries[i];
   
                         CDEBUG(D_LAYOUT, "Inherite from default: size:%hu "
-                              "nr:%u offset:%u %s\n",
+                              "nr:%u offset:%u pattern %#x %s\n",
                                def_comp->llc_stripe_size,
                                def_comp->llc_stripe_count,
                                def_comp->llc_stripe_offset,
+                              def_comp->llc_pattern,
                                def_comp->llc_pool ?: "");
   
                         *obj_comp = *def_comp;
@@@ -3782,7 -3785,8 +3785,8 @@@
                         if (!lo->ldo_is_composite)
                                 continue;
   
-                       if (obj_comp->llc_stripe_count <= 0)
+                       if (obj_comp->llc_stripe_count <= 0 &&
+                           obj_comp->llc_pattern != LOV_PATTERN_MDT)
                                 obj_comp->llc_stripe_count =
                                         desc->ld_default_stripe_count;
                         if (obj_comp->llc_stripe_size <= 0)
@@@ -4330,6 -4334,9 +4334,9 @@@ int lod_striped_create(const struct lu_
                 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
                         lod_comp_set_init(lod_comp);
   
+               if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
+                       lod_comp_set_init(lod_comp);
+ 
                 if (lod_comp->llc_stripe == NULL)
                         continue;
   
@@@ -4863,9 -4870,9 +4870,9 @@@ static int lod_declare_layout_change(co
         if (buf && buf->lb_len)  {
                 struct lov_user_md_v1 *v1 = buf->lb_buf;
   
- -              if (v1->lmm_magic != (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1) &&
- -                  v1->lmm_magic !=
- -                              __swab32(LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1)) {
+ +              if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
+ +                  v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
+ +                                            LOV_MAGIC_COMP_V1)) {
                         CERROR("%s: the replay buffer of layout extend "
                                "(magic %#x) does not contain expected "
                                "composite layout.\n",
diff --combined lustre/lod/lod_qos.c

index 4ae665d,bf09e1f..fbb8111
--- 1/lustre/lod/lod_qos.c
--- 2/lustre/lod/lod_qos.c
+++ b/lustre/lod/lod_qos.c
@@@ -1729,7 -1729,7 +1729,7 @@@ int lod_use_defined_striping(const stru
         int     rc = 0, i;
         ENTRY;
   
- -      magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEF;
+ +      magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEFINED;
   
         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 &&
             magic != LOV_MAGIC_COMP_V1)
@@@ -1799,7 -1799,8 +1799,8 @@@
                 lod_obj_set_pool(mo, i, pool_name);
   
                 if ((!mo->ldo_is_composite || lod_comp_inited(lod_comp)) &&
-                   !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)) {
+                   !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) &&
+                   !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) {
                         rc = lod_initialize_objects(env, mo, objs, i);
                         if (rc)
                                 GOTO(out, rc);
@@@ -1857,7 -1858,7 +1858,7 @@@ int lod_qos_parse_config(const struct l
         comp_v1 = buf->lb_buf;
         magic = v1->lmm_magic;
   
- -      if (unlikely(le32_to_cpu(magic) & LOV_MAGIC_DEF)) {
+ +      if (unlikely(le32_to_cpu(magic) & LOV_MAGIC_DEFINED)) {
                 /* try to use as fully defined striping */
                 rc = lod_use_defined_striping(env, lo, buf);
                 RETURN(rc);
@@@ -1958,25 -1959,29 +1959,29 @@@
   
                 if (v1->lmm_pattern == 0)
                         v1->lmm_pattern = LOV_PATTERN_RAID0;
-               if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0) {
+               if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0 &&
+                   lov_pattern(v1->lmm_pattern) != LOV_PATTERN_MDT) {
                         CDEBUG(D_LAYOUT, "%s: invalid pattern: %x\n",
                                lod2obd(d)->obd_name, v1->lmm_pattern);
                         GOTO(free_comp, rc = -EINVAL);
                 }
   
                 lod_comp->llc_pattern = v1->lmm_pattern;
- 
                 lod_comp->llc_stripe_size = desc->ld_default_stripe_size;
                 if (v1->lmm_stripe_size)
                         lod_comp->llc_stripe_size = v1->lmm_stripe_size;
   
                 lod_comp->llc_stripe_count = desc->ld_default_stripe_count;
-               if (v1->lmm_stripe_count)
+               if (v1->lmm_stripe_count ||
+                   lov_pattern(v1->lmm_pattern) == LOV_PATTERN_MDT)
                         lod_comp->llc_stripe_count = v1->lmm_stripe_count;
   
                 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
                 lod_obj_set_pool(lo, i, pool_name);
   
+               LASSERT(ergo(lov_pattern(lod_comp->llc_pattern) ==
+                            LOV_PATTERN_MDT, lod_comp->llc_stripe_count == 0));
+ 
                 if (pool_name == NULL)
                         continue;
   
@@@ -2051,6 -2056,10 +2056,10 @@@ int lod_qos_prep_create(const struct lu
         if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
                 RETURN(0);
   
+       /* A Data-on-MDT component is being created */
+       if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
+               RETURN(0);
+ 
         if (likely(lod_comp->llc_stripe == NULL)) {
                 /*
                  * no striping has been created so far
diff --combined lustre/lod/lproc_lod.c

index ea17bac,45ab539..3e23ec8
--- 1/lustre/lod/lproc_lod.c
--- 2/lustre/lod/lproc_lod.c
+++ b/lustre/lod/lproc_lod.c
@@@ -54,6 -54,69 +54,69 @@@
    * \retval 0          on success
    * \retval negative   error code if failed
    */
+ static int lod_dom_stripesize_seq_show(struct seq_file *m, void *v)
+ {
+       struct obd_device *dev = m->private;
+       struct lod_device *lod;
+ 
+       LASSERT(dev != NULL);
+       lod  = lu2lod_dev(dev->obd_lu_dev);
+       seq_printf(m, "%u\n", lod->lod_dom_max_stripesize);
+       return 0;
+ }
+ 
+ /**
+  * Set the maximum DoM (Data-on-MDT) stripe size.
+  *
+  * \param[in] file    proc file
+  * \param[in] buffer  string containing the new maximum DoM stripe size,
+  *                    parsed by lprocfs_str_with_units_to_s64(); negative
+  *                    values and values above 1GB fail with -ERANGE
+  * \param[in] count   @buffer length
+  * \param[in] off     unused for single entry
+  *
+  * \retval @count     on success
+  * \retval negative   error code if failed
+  */
+ static ssize_t
+ lod_dom_stripesize_seq_write(struct file *file, const char __user *buffer,
+                             size_t count, loff_t *off)
+ {
+       struct seq_file *m = file->private_data;
+       struct obd_device *dev = m->private;
+       struct lod_device *lod;
+       __s64 val;
+       int rc;
+ 
+       LASSERT(dev != NULL);
+       lod  = lu2lod_dev(dev->obd_lu_dev);
+       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
+       if (rc)
+               return rc;
+       if (val < 0)
+               return -ERANGE;
+ 
+       /* 1GB is the limit */
+       if (val > (1ULL << 30))
+               return -ERANGE;
+       else if (val > 0)
+               lod_fix_desc_stripe_size(&val);
+ 
+       lod->lod_dom_max_stripesize = val;
+ 
+       return count;
+ }
+ LPROC_SEQ_FOPS(lod_dom_stripesize);
+ 
+ /**
+  * Show default stripe size.
+  *
+  * \param[in] m               seq file
+  * \param[in] v               unused for single entry
+  *
+  * \retval 0          on success
+  * \retval negative   error code if failed
+  */
   static int lod_stripesize_seq_show(struct seq_file *m, void *v)
   {
         struct obd_device *dev = m->private;
@@@ -120,7 -183,8 +183,7 @@@ static int lod_stripeoffset_seq_show(st
   
         LASSERT(dev != NULL);
         lod  = lu2lod_dev(dev->obd_lu_dev);
- -      seq_printf(m, "%llu\n",
- -                 lod->lod_desc.ld_default_stripe_offset);
+ +      seq_printf(m, "%lld\n", lod->lod_desc.ld_default_stripe_offset);
         return 0;
   }
   
@@@ -150,7 -214,7 +213,7 @@@ lod_stripeoffset_seq_write(struct file 
   
         LASSERT(dev != NULL);
         lod  = lu2lod_dev(dev->obd_lu_dev);
- -      rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
+ +      rc = lprocfs_str_to_s64(buffer, count, &val);
         if (rc)
                 return rc;
         if (val < -1)
@@@ -757,6 -821,10 +820,10 @@@ static struct lprocfs_vars lprocfs_lod_
           .fops =       &lod_qos_maxage_fops    },
         { .name =       "lmv_failout",
           .fops =       &lod_lmv_failout_fops   },
+       {
+         .name = "dom_stripesize",
+         .fops = &lod_dom_stripesize_fops
+       },
         { NULL }
   };
   
diff --combined lustre/mdc/mdc_locks.c

index e007fc3,6e17fea..86b0e67
--- 1/lustre/mdc/mdc_locks.c
--- 2/lustre/mdc/mdc_locks.c
+++ b/lustre/mdc/mdc_locks.c
@@@ -544,8 -544,10 +544,10 @@@ static int mdc_finish_enqueue(struct ob
         struct ldlm_request *lockreq;
         struct ldlm_reply   *lockrep;
         struct ldlm_lock    *lock;
+       struct mdt_body     *body = NULL;
         void                *lvb_data = NULL;
         __u32                lvb_len = 0;
+ 
           ENTRY;
   
           LASSERT(rc >= 0);
@@@ -604,8 -606,6 +606,6 @@@
   
           /* We know what to expect, so we do any byte flipping required here */
         if (it_has_reply_body(it)) {
-                 struct mdt_body *body;
- 
                   body = req_capsule_server_get(pill, &RMF_MDT_BODY);
                   if (body == NULL) {
                           CERROR ("Can't swab mdt_body\n");
@@@ -688,7 -688,10 +688,10 @@@
          * client still does this checking in case it's talking with an old
          * server. - Jinshan */
         lock = ldlm_handle2lock(lockh);
-       if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL &&
+       if (lock == NULL)
+               RETURN(rc);
+ 
+       if (ldlm_has_layout(lock) && lvb_data != NULL &&
             !(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) {
                 void *lmm;
   
@@@ -696,10 -699,9 +699,9 @@@
                         ldlm_it2str(it->it_op), lvb_len);
   
                 OBD_ALLOC_LARGE(lmm, lvb_len);
-               if (lmm == NULL) {
-                       LDLM_LOCK_PUT(lock);
-                       RETURN(-ENOMEM);
-               }
+               if (lmm == NULL)
+                       GOTO(out_lock, rc = -ENOMEM);
+ 
                 memcpy(lmm, lvb_data, lvb_len);
   
                 /* install lvb_data */
@@@ -714,8 -716,24 +716,24 @@@
                 if (lmm != NULL)
                         OBD_FREE_LARGE(lmm, lvb_len);
         }
-       if (lock != NULL)
-               LDLM_LOCK_PUT(lock);
+ 
+       if (ldlm_has_dom(lock)) {
+               LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast);
+ 
+               body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+               if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) {
+                       LDLM_ERROR(lock, "%s: DoM lock without size.\n",
+                                  exp->exp_obd->obd_name);
+                       GOTO(out_lock, rc = -EPROTO);
+               }
+ 
+               LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu",
+                          ldlm_it2str(it->it_op), body->mbo_dom_size);
+ 
+               rc = mdc_fill_lvb(req, &lock->l_ost_lvb);
+       }
+ out_lock:
+       LDLM_LOCK_PUT(lock);
   
         RETURN(rc);
   }
@@@ -812,18 -830,25 +830,25 @@@ resend
                 rc = obd_get_request_slot(&obddev->u.cli);
                 if (rc != 0) {
                         mdc_put_mod_rpc_slot(req, it);
-                         mdc_clear_replay_flag(req, 0);
-                         ptlrpc_req_finished(req);
-                         RETURN(rc);
-                 }
-         }
+                       mdc_clear_replay_flag(req, 0);
+                       ptlrpc_req_finished(req);
+                       RETURN(rc);
+               }
+       }
   
-         rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+       /* With Data-on-MDT the glimpse callback is needed too.
+        * It is set here in advance, rather than in mdc_finish_enqueue(),
+        * to avoid possible races. Having a glimpse handler on non-DoM
+        * locks is safe and costs nothing. */
+       if (einfo->ei_cb_gl == NULL)
+               einfo->ei_cb_gl = mdc_ldlm_glimpse_ast;
+ 
+       rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
                               0, lvb_type, lockh, 0);
-         if (!it) {
-                 /* For flock requests we immediatelly return without further
-                    delay and let caller deal with the rest, since rest of
-                    this function metadata processing makes no sense for flock
+       if (!it) {
+               /* For flock requests we immediately return without further
+                  delay and let caller deal with the rest, since rest of
+                  this function metadata processing makes no sense for flock
                    requests anyway. But in case of problem during comms with
                    Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we
                    can not rely on caller and this mainly for F_UNLCKs
@@@ -840,10 -865,8 +865,10 @@@
         mdc_put_mod_rpc_slot(req, it);
   
         if (rc < 0) {
- -              CDEBUG(D_INFO, "%s: ldlm_cli_enqueue failed: rc = %d\n",
- -                     obddev->obd_name, rc);
+ +              CDEBUG(D_INFO,
+ +                    "%s: ldlm_cli_enqueue "DFID":"DFID"=%s failed: rc = %d\n",
+ +                    obddev->obd_name, PFID(&op_data->op_fid1),
+ +                    PFID(&op_data->op_fid2), op_data->op_name ?: "", rc);
   
                 mdc_clear_replay_flag(req, rc);
                 ptlrpc_req_finished(req);
@@@ -1116,6 -1139,7 +1141,7 @@@ int mdc_intent_lock(struct obd_export *
                 .ei_mode        = it_to_lock_mode(it),
                 .ei_cb_bl       = cb_blocking,
                 .ei_cb_cp       = ldlm_completion_ast,
+               .ei_cb_gl       = mdc_ldlm_glimpse_ast,
         };
         struct lustre_handle lockh;
         int rc = 0;
@@@ -1242,6 -1266,13 +1268,13 @@@ int mdc_intent_getattr_async(struct obd
                 RETURN(rc);
         }
   
+       /* With Data-on-MDT the glimpse callback is needed too.
+        * It is set here in advance, rather than in mdc_finish_enqueue(),
+        * to avoid possible races. Having a glimpse handler on non-DoM
+        * locks is safe and costs nothing. */
+       if (minfo->mi_einfo.ei_cb_gl == NULL)
+               minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast;
+ 
         rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy,
                               &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1);
         if (rc < 0) {
diff --combined lustre/mdc/mdc_request.c

index 9a5d25d,61ed623..1641161
--- 1/lustre/mdc/mdc_request.c
--- 2/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@@ -56,6 -56,7 +56,7 @@@
   #include <uapi/linux/lustre/lustre_param.h>
   #include <lustre_swab.h>
   #include <obd_class.h>
+ #include <lustre_osc.h>
   
   #include "mdc_internal.h"
   
@@@ -333,11 -334,11 +334,11 @@@ static int mdc_xattr_common(struct obd_
                 }
         }
   
- -        if (opcode == MDS_REINT) {
- -                struct mdt_rec_setxattr *rec;
+ +      if (opcode == MDS_REINT) {
+ +              struct mdt_rec_setxattr *rec;
   
- -                CLASSERT(sizeof(struct mdt_rec_setxattr) ==
- -                         sizeof(struct mdt_rec_reint));
+ +              CLASSERT(sizeof(struct mdt_rec_setxattr) ==
+ +                       sizeof(struct mdt_rec_reint));
                 rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
                 rec->sx_opcode = REINT_SETXATTR;
                 rec->sx_fsuid  = from_kuid(&init_user_ns, current_fsuid());
@@@ -2239,14 -2240,6 +2240,6 @@@ static int mdc_set_info_async(const str
                                          keylen, key, vallen, val, set);
                   RETURN(rc);
           }
-         if (KEY_IS(KEY_SPTLRPC_CONF)) {
-                 sptlrpc_conf_client_adapt(exp->exp_obd);
-                 RETURN(0);
-         }
-         if (KEY_IS(KEY_FLUSH_CTX)) {
-                 sptlrpc_import_flush_my_ctx(imp);
-                 RETURN(0);
-         }
           if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
                   rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
                                          keylen, key, vallen, val, set);
@@@ -2264,8 -2257,8 +2257,8 @@@
                 RETURN(0);
         }
   
-       CERROR("Unknown key %s\n", (char *)key);
-       RETURN(-EINVAL);
+       rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set);
+       RETURN(rc);
   }
   
   static int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
@@@ -2344,14 -2337,19 +2337,19 @@@ static int mdc_fsync(struct obd_export 
   static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
                             enum obd_import_event event)
   {
+       struct client_obd *cli = &obd->u.cli;
         int rc = 0;
   
         LASSERT(imp->imp_obd == obd);
   
         switch (event) {
- 
-       case IMP_EVENT_INACTIVE: {
-               struct client_obd *cli = &obd->u.cli;
+       case IMP_EVENT_DISCON:
+               spin_lock(&cli->cl_loi_list_lock);
+               cli->cl_avail_grant = 0;
+               cli->cl_lost_grant = 0;
+               spin_unlock(&cli->cl_loi_list_lock);
+               break;
+       case IMP_EVENT_INACTIVE:
                 /*
                  * Flush current sequence to make client obtain new one
                  * from server in case of disconnect/reconnect.
@@@ -2363,12 -2361,28 +2361,28 @@@
   
                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
                 break;
-       }
         case IMP_EVENT_INVALIDATE: {
                 struct ldlm_namespace *ns = obd->obd_namespace;
+               struct lu_env *env;
+               __u16 refcheck;
   
                 ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
   
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       /* Reset grants. All pages go to failing rpcs due to
+                        * the invalid import.
+                        */
+                       osc_io_unplug(env, cli, NULL);
+ 
+                       cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                                osc_ldlm_resource_invalidate,
+                                                env, 0);
+                       cl_env_put(env, &refcheck);
+                       ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+               } else {
+                       rc = PTR_ERR(env);
+               }
                 break;
         }
         case IMP_EVENT_ACTIVE:
@@@ -2377,10 -2391,15 +2391,15 @@@
                 if (rc == 0)
                         rc = mdc_kuc_reregister(imp);
                 break;
-       case IMP_EVENT_OCD:
+       case IMP_EVENT_OCD: {
+               struct obd_connect_data *ocd = &imp->imp_connect_data;
+ 
+               if (OCD_HAS_FLAG(ocd, GRANT))
+                       osc_init_grant(cli, ocd);
+ 
                 rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
                 break;
-       case IMP_EVENT_DISCON:
+       }
         case IMP_EVENT_DEACTIVATE:
         case IMP_EVENT_ACTIVATE:
                 break;
@@@ -2477,23 -2496,22 +2496,22 @@@ static void mdc_llog_finish(struct obd_
         EXIT;
   }
   
- static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+ int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
   {
-       int                             rc;
+       int rc;
+ 
         ENTRY;
   
-       rc = ptlrpcd_addref();
+       rc = osc_setup_common(obd, cfg);
         if (rc < 0)
                 RETURN(rc);
   
-         rc = client_obd_setup(obd, cfg);
-         if (rc)
-               GOTO(err_ptlrpcd_decref, rc);
   #ifdef CONFIG_PROC_FS
         obd->obd_vars = lprocfs_mdc_obd_vars;
         lprocfs_obd_setup(obd, false);
         lprocfs_alloc_md_stats(obd, 0);
   #endif
+ 
         sptlrpc_lprocfs_cliobd_attach(obd);
         ptlrpc_lprocfs_register_obd(obd);
   
@@@ -2505,26 -2523,27 +2523,27 @@@
           if (rc) {
                   CERROR("%s: failed to setup llogging subsystems: rc = %d\n",
                        obd->obd_name, rc);
-               GOTO(err_mdc_cleanup, rc);
+               GOTO(err_llog_cleanup, rc);
           }
   
         rc = mdc_changelog_cdev_init(obd);
         if (rc) {
                 CERROR("%s: failed to setup changelog char device: rc = %d\n",
                        obd->obd_name, rc);
-               GOTO(err_mdc_cleanup, rc);
+               GOTO(err_changelog_cleanup, rc);
         }
   
-       EXIT;
- err_mdc_cleanup:
-       if (rc)
-               client_obd_cleanup(obd);
+       RETURN(rc);
   
- err_ptlrpcd_decref:
-       if (rc)
-               ptlrpcd_decref();
+ err_changelog_cleanup:
+       mdc_llog_finish(obd);
+ err_llog_cleanup:
+       ptlrpc_lprocfs_unregister_obd(obd);
+       lprocfs_obd_cleanup(obd);
+       lprocfs_free_md_stats(obd);
   
-         return rc;
+       osc_cleanup_common(obd);
+       return rc;
   }
   
   /* Initialize the default and maximum LOV EA sizes.  This allows
@@@ -2555,6 -2574,8 +2574,8 @@@ static int mdc_precleanup(struct obd_de
   {
         ENTRY;
   
+       osc_precleanup_common(obd);
+ 
         /* Failsafe, ok if racy */
         if (obd->obd_type->typ_refcnt <= 1)
                 libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
@@@ -2571,15 -2592,15 +2592,15 @@@
   
   static int mdc_cleanup(struct obd_device *obd)
   {
-         ptlrpcd_decref();
- 
-         return client_obd_cleanup(obd);
+       return osc_cleanup_common(obd);
   }
   
- static int mdc_process_config(struct obd_device *obd, size_t len, void *buf)
+ int mdc_process_config(struct obd_device *obd, size_t len, void *buf)
   {
-         struct lustre_cfg *lcfg = buf;
-       int rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd);
+       struct lustre_cfg *lcfg = buf;
+       int rc;
+ 
+       rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd);
         return (rc > 0 ? 0: rc);
   }
   
@@@ -2591,7 -2612,8 +2612,8 @@@ static struct obd_ops mdc_obd_ops = 
           .o_add_conn         = client_import_add_conn,
           .o_del_conn         = client_import_del_conn,
           .o_connect          = client_connect_import,
-         .o_disconnect       = client_disconnect_export,
+       .o_reconnect        = osc_reconnect,
+       .o_disconnect       = osc_disconnect,
           .o_iocontrol        = mdc_iocontrol,
           .o_set_info_async   = mdc_set_info_async,
           .o_statfs           = mdc_statfs,
@@@ -2637,7 -2659,7 +2659,7 @@@ static struct md_ops mdc_md_ops = 
   static int __init mdc_init(void)
   {
         return class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL,
-                                  LUSTRE_MDC_NAME, NULL);
+                                  LUSTRE_MDC_NAME, &mdc_device_type);
   }
   
   static void __exit mdc_exit(void)
diff --combined lustre/mdt/mdt_handler.c

index 9f8590b,20a64a2..5ef06db
--- 1/lustre/mdt/mdt_handler.c
--- 2/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@@ -61,7 -61,7 +61,7 @@@
   #include <obd.h>
   #include <obd_support.h>
   #include <lustre_barrier.h>
- 
+ #include <obd_cksum.h>
   #include <llog_swab.h>
   
   #include "mdt_internal.h"
@@@ -415,7 -415,8 +415,8 @@@ static int mdt_statfs(struct tgt_sessio
   {
         struct ptlrpc_request           *req = tgt_ses_req(tsi);
         struct mdt_thread_info          *info = tsi2mdt_info(tsi);
-       struct md_device                *next = info->mti_mdt->mdt_child;
+       struct mdt_device               *mdt = info->mti_mdt;
+       struct tg_grants_data           *tgd = &mdt->mdt_lut.lut_tgd;
         struct ptlrpc_service_part      *svcpt;
         struct obd_statfs               *osfs;
         int                             rc;
@@@ -440,24 -441,44 +441,44 @@@
         if (!osfs)
                 GOTO(out, rc = -EPROTO);
   
-       /** statfs information are cached in the mdt_device */
-       if (cfs_time_before_64(info->mti_mdt->mdt_osfs_age,
-                              cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS))) {
-               /** statfs data is too old, get up-to-date one */
-               rc = next->md_ops->mdo_statfs(info->mti_env, next, osfs);
-               if (rc)
-                       GOTO(out, rc);
-               spin_lock(&info->mti_mdt->mdt_lock);
-               info->mti_mdt->mdt_osfs = *osfs;
-               info->mti_mdt->mdt_osfs_age = cfs_time_current_64();
-               spin_unlock(&info->mti_mdt->mdt_lock);
-       } else {
-               /** use cached statfs data */
-               spin_lock(&info->mti_mdt->mdt_lock);
-               *osfs = info->mti_mdt->mdt_osfs;
-               spin_unlock(&info->mti_mdt->mdt_lock);
-       }
+       rc = tgt_statfs_internal(tsi->tsi_env, &mdt->mdt_lut, osfs,
+                                cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                                NULL);
+       if (unlikely(rc))
+               GOTO(out, rc);
   
+       /* at least try to account for cached pages.  it's still racy and
+        * might be under-reporting if clients haven't announced their
+        * caches with brw recently */
+       CDEBUG(D_SUPER | D_CACHE, "blocks cached %llu granted %llu"
+              " pending %llu free %llu avail %llu\n",
+              tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
+              tgd->tgd_tot_pending,
+              osfs->os_bfree << tgd->tgd_blockbits,
+              osfs->os_bavail << tgd->tgd_blockbits);
+ 
+       osfs->os_bavail -= min_t(u64, osfs->os_bavail,
+                                ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending +
+                                  osfs->os_bsize - 1) >> tgd->tgd_blockbits));
+ 
+       tgt_grant_sanity_check(mdt->mdt_lu_dev.ld_obd, __func__);
+       CDEBUG(D_CACHE, "%llu blocks: %llu free, %llu avail; "
+              "%llu objects: %llu free; state %x\n",
+              osfs->os_blocks, osfs->os_bfree, osfs->os_bavail,
+              osfs->os_files, osfs->os_ffree, osfs->os_state);
+ 
+       if (!exp_grant_param_supp(tsi->tsi_exp) &&
+           tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) {
+               /* clients which don't support OBD_CONNECT_GRANT_PARAM
+                * should not see a block size > page size, otherwise
+                * cl_lost_grant goes mad. Therefore, we emulate a 4KB (=2^12)
+                * block size which is the biggest block size known to work
+                * with all clients' page sizes. */
+               osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bfree  <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT;
+       }
         if (rc == 0)
                 mdt_counter_incr(req, LPROC_MDT_STATFS);
   out:
@@@ -465,6 -486,41 +486,41 @@@
         RETURN(rc);
   }
   
+ /**
+  * Pack size attributes into the reply.
+  */
+ int mdt_pack_size2body(struct mdt_thread_info *info,
+                       const struct lu_fid *fid, bool dom_lock)
+ {
+       struct mdt_body *b;
+       struct md_attr *ma = &info->mti_attr;
+       int dom_stripe;
+ 
+       ENTRY;
+ 
+       LASSERT(ma->ma_attr.la_valid & LA_MODE);
+ 
+       if (!S_ISREG(ma->ma_attr.la_mode) ||
+           !(ma->ma_valid & MA_LOV && ma->ma_lmm != NULL))
+               RETURN(-ENODATA);
+ 
+       dom_stripe = mdt_lmm_dom_entry(ma->ma_lmm);
+       /* no DoM stripe, no size in reply */
+       if (dom_stripe == LMM_NO_DOM)
+               RETURN(-ENOENT);
+ 
+       /* no DoM lock, no size in reply */
+       if (!dom_lock)
+               RETURN(0);
+ 
+       /* The layout has a DoM stripe and the DoM lock is held, so
+        * return the size in the reply body. */
+       b = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+ 
+       mdt_dom_object_size(info->mti_env, info->mti_mdt, fid, b, dom_lock);
+       RETURN(0);
+ }
+ 
   #ifdef CONFIG_FS_POSIX_ACL
   /*
    * Pack ACL data into the reply. UIDs/GIDs are mapped and filtered by nodemap.
@@@ -665,17 -721,18 +721,18 @@@ void mdt_pack_attr2body(struct mdt_thre
                 /* if no object is allocated on osts, the size on mds is valid.
                  * b=22272 */
                 b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
-       } else if ((ma->ma_valid & MA_LOV) && ma->ma_lmm != NULL &&
-                  mdt_hsm_is_released(ma->ma_lmm)) {
-               /* A released file stores its size on MDS. */
-               /* But return 1 block for released file, unless tools like tar
-                * will consider it fully sparse. (LU-3864)
-                */
-               if (unlikely(b->mbo_size == 0))
-                       b->mbo_blocks = 0;
-               else
-                       b->mbo_blocks = 1;
-               b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+       } else if ((ma->ma_valid & MA_LOV) && ma->ma_lmm != NULL) {
+               if (mdt_hsm_is_released(ma->ma_lmm)) {
+                       /* A released file stores its size on MDS. */
+                       /* But return 1 block for released file, or tools
+                        * like tar will consider it fully sparse. (LU-3864)
+                        */
+                       if (unlikely(b->mbo_size == 0))
+                               b->mbo_blocks = 0;
+                       else
+                               b->mbo_blocks = 1;
+                       b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+               }
         }
   
         if (fid != NULL && (b->mbo_valid & OBD_MD_FLSIZE))
@@@ -1683,12 -1740,16 +1740,16 @@@ static int mdt_getattr_name_lock(struc
                 /* layout lock must be granted in a best-effort way
                  * for IT operations */
                 LASSERT(!(child_bits & MDS_INODELOCK_LAYOUT));
-               if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_GETATTR) &&
-                   exp_connect_layout(info->mti_exp) &&
-                   S_ISREG(lu_object_attr(&child->mot_obj)) &&
+               if (S_ISREG(lu_object_attr(&child->mot_obj)) &&
                     !mdt_object_remote(child) && ldlm_rep != NULL) {
-                       /* try to grant layout lock for regular file. */
-                       try_bits = MDS_INODELOCK_LAYOUT;
+                       if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_GETATTR) &&
+                           exp_connect_layout(info->mti_exp)) {
+                               /* try to grant layout lock for regular file. */
+                               try_bits = MDS_INODELOCK_LAYOUT;
+                       }
+                       /* Acquire DOM lock in advance for data-on-mdt file */
+                       if (child != parent)
+                               try_bits |= MDS_INODELOCK_DOM;
                 }
   
                 if (try_bits != 0) {
@@@ -1723,6 -1784,27 +1784,27 @@@
                          "Lock res_id: "DLDLMRES", fid: "DFID"\n",
                          PLDLMRES(lock->l_resource),
                          PFID(mdt_object_fid(child)));
+ 
+               if (S_ISREG(lu_object_attr(&child->mot_obj)) &&
+                   mdt_object_exists(child) && !mdt_object_remote(child) &&
+                   child != parent) {
+                       LDLM_LOCK_PUT(lock);
+                       mdt_object_put(info->mti_env, child);
+                       /* NB: call the mdt_pack_size2body always after
+                        * mdt_object_put(), that is why this special
+                        * exit path is used. */
+                       rc = mdt_pack_size2body(info, child_fid,
+                                               child_bits & MDS_INODELOCK_DOM);
+                       if (rc != 0 && child_bits & MDS_INODELOCK_DOM) {
+                               /* DOM lock was taken in advance but this is
+                                * not a DoM file. Drop the DOM bit. */
+                               lock_res_and_lock(lock);
+                               ldlm_inodebits_drop(lock, MDS_INODELOCK_DOM);
+                               unlock_res_and_lock(lock);
+                       }
+ 
+                       GOTO(out_parent, rc = 0);
+               }
           }
           if (lock)
                   LDLM_LOCK_PUT(lock);
@@@ -2082,20 -2164,21 +2164,21 @@@ static int mdt_device_sync(const struc
   }
   
   /* this should sync this object */
- static int mdt_object_sync(struct mdt_thread_info *info)
+ static int mdt_object_sync(const struct lu_env *env, struct obd_export *exp,
+                          struct mdt_object *mo)
   {
-       struct md_object *next;
         int rc;
+ 
         ENTRY;
   
-       if (!mdt_object_exists(info->mti_object)) {
+       if (!mdt_object_exists(mo)) {
                 CWARN("%s: non existing object "DFID": rc = %d\n",
-                     mdt_obd_name(info->mti_mdt),
-                     PFID(mdt_object_fid(info->mti_object)), -ESTALE);
+                     exp->exp_obd->obd_name, PFID(mdt_object_fid(mo)),
+                     -ESTALE);
                 RETURN(-ESTALE);
         }
-       next = mdt_object_child(info->mti_object);
-       rc = mo_object_sync(info->mti_env, next);
+ 
+       rc = mo_object_sync(env, mdt_object_child(mo));
   
         RETURN(rc);
   }
@@@ -2118,7 -2201,8 +2201,8 @@@ static int mdt_sync(struct tgt_session_
                 struct mdt_thread_info *info = tsi2mdt_info(tsi);
   
                 /* sync an object */
-               rc = mdt_object_sync(info);
+               rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp,
+                                    info->mti_object);
                 if (rc == 0) {
                         const struct lu_fid *fid;
                         struct lu_attr *la = &info->mti_attr.ma_attr;
@@@ -2142,6 -2226,54 +2226,54 @@@
         RETURN(rc);
   }
   
+ static int mdt_data_sync(struct tgt_session_info *tsi)
+ {
+       struct mdt_thread_info *info;
+       struct mdt_device *mdt = mdt_exp2dev(tsi->tsi_exp);
+       struct ost_body *body = tsi->tsi_ost_body;
+       struct ost_body *repbody;
+       struct mdt_object *mo = NULL;
+       struct md_attr *ma;
+       int rc = 0;
+ 
+       ENTRY;
+ 
+       repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY);
+ 
+       /* if no fid is specified then do nothing,
+        * device sync is done via MDS_SYNC */
+       if (fid_is_zero(&tsi->tsi_fid))
+               RETURN(0);
+ 
+       mo = mdt_object_find(tsi->tsi_env, mdt, &tsi->tsi_fid);
+       if (IS_ERR(mo))
+               RETURN(PTR_ERR(mo));
+ 
+       rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp, mo);
+       if (rc)
+               GOTO(put, rc);
+ 
+       repbody->oa.o_oi = body->oa.o_oi;
+       repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+ 
+       info = tsi2mdt_info(tsi);
+       ma = &info->mti_attr;
+       ma->ma_need = MA_INODE;
+       ma->ma_valid = 0;
+       rc = mdt_attr_get_complex(info, mo, ma);
+       if (rc == 0)
+               obdo_from_la(&repbody->oa, &ma->ma_attr, VALID_FLAGS);
+       else
+               rc = 0;
+       mdt_thread_info_fini(info);
+ 
+       EXIT;
+ put:
+       if (mo != NULL)
+               mdt_object_put(tsi->tsi_env, mo);
+       return rc;
+ }
+ 
   /*
    * Handle quota control requests to consult current usage/limit, but also
    * to configure quota enforcement
@@@ -2865,8 -2997,8 +2997,8 @@@ int mdt_object_lock_try(struct mdt_thre
    * \param mode lock mode
    * \param decref force immediate lock releasing
    */
- static void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
-                         enum ldlm_mode mode, int decref)
+ void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
+                  enum ldlm_mode mode, int decref)
   {
         ENTRY;
   
@@@ -3221,13 -3353,14 +3353,14 @@@ enum mdt_it_code 
           MDT_IT_GETXATTR,
           MDT_IT_LAYOUT,
         MDT_IT_QUOTA,
-         MDT_IT_NR
+       MDT_IT_GLIMPSE,
+       MDT_IT_BRW,
+       MDT_IT_NR
   };
   
   static int mdt_intent_getattr(enum mdt_it_code opcode,
-                               struct mdt_thread_info *info,
-                               struct ldlm_lock **,
-                             __u64);
+                             struct mdt_thread_info *info,
+                             struct ldlm_lock **, __u64);
   
   static int mdt_intent_getxattr(enum mdt_it_code opcode,
                                 struct mdt_thread_info *info,
@@@ -3242,6 -3375,20 +3375,20 @@@ static int mdt_intent_reint(enum mdt_it
                               struct mdt_thread_info *info,
                               struct ldlm_lock **,
                             __u64);
+ /*
+  * IT_GLIMPSE intent handler: forwards the request to mdt_glimpse_enqueue()
+  * together with the LDLM namespace of the MDT device that owns \a info.
+  * \a opcode is not consulted.
+  */
+ static int mdt_intent_glimpse(enum mdt_it_code opcode,
+                             struct mdt_thread_info *info,
+                             struct ldlm_lock **lockp, __u64 flags)
+ {
+       struct mdt_device *mdt = info->mti_mdt;
+ 
+       return mdt_glimpse_enqueue(info, mdt->mdt_namespace, lockp, flags);
+ }
+ /*
+  * IT_BRW intent handler: forwards the request to mdt_brw_enqueue()
+  * together with the LDLM namespace of the MDT device that owns \a info.
+  * \a opcode is not consulted.
+  */
+ static int mdt_intent_brw(enum mdt_it_code opcode,
+                         struct mdt_thread_info *info,
+                         struct ldlm_lock **lockp, __u64 flags)
+ {
+       struct mdt_device *mdt = info->mti_mdt;
+ 
+       return mdt_brw_enqueue(info, mdt->mdt_namespace, lockp, flags);
+ }
   
   static struct mdt_it_flavor {
           const struct req_format *it_fmt;
@@@ -3313,14 -3460,24 +3460,24 @@@
                 .it_fmt   = &RQF_LDLM_INTENT_LAYOUT,
                 .it_flags = 0,
                 .it_act   = mdt_intent_layout
-       }
+       },
+       [MDT_IT_GLIMPSE] = {
+               .it_fmt = &RQF_LDLM_INTENT,
+               .it_flags = 0,
+               .it_act = mdt_intent_glimpse,
+       },
+       [MDT_IT_BRW] = {
+               .it_fmt = &RQF_LDLM_INTENT,
+               .it_flags = 0,
+               .it_act = mdt_intent_brw,
+       },
+ 
   };
   
- static int
- mdt_intent_lock_replace(struct mdt_thread_info *info,
-                       struct ldlm_lock **lockp,
-                       struct mdt_lock_handle *lh,
-                       __u64 flags, int result)
+ int mdt_intent_lock_replace(struct mdt_thread_info *info,
+                           struct ldlm_lock **lockp,
+                           struct mdt_lock_handle *lh,
+                           __u64 flags, int result)
   {
           struct ptlrpc_request  *req = mdt_info_req(info);
           struct ldlm_lock       *lock = *lockp;
@@@ -3396,6 -3553,8 +3553,8 @@@
           new_lock->l_export = class_export_lock_get(req->rq_export, new_lock);
           new_lock->l_blocking_ast = lock->l_blocking_ast;
           new_lock->l_completion_ast = lock->l_completion_ast;
+       if (ldlm_has_dom(new_lock))
+               new_lock->l_glimpse_ast = ldlm_server_glimpse_ast;
           new_lock->l_remote_handle = lock->l_remote_handle;
           new_lock->l_flags &= ~LDLM_FL_LOCAL;
   
@@@ -3411,10 -3570,9 +3570,9 @@@
           RETURN(ELDLM_LOCK_REPLACED);
   }
   
- static void mdt_intent_fixup_resent(struct mdt_thread_info *info,
-                                   struct ldlm_lock *new_lock,
-                                   struct mdt_lock_handle *lh,
-                                   __u64 flags)
+ void mdt_intent_fixup_resent(struct mdt_thread_info *info,
+                            struct ldlm_lock *new_lock,
+                            struct mdt_lock_handle *lh, __u64 flags)
   {
           struct ptlrpc_request  *req = mdt_info_req(info);
           struct ldlm_request    *dlmreq;
@@@ -3829,6 -3987,12 +3987,12 @@@ static int mdt_intent_code(enum ldlm_in
         case IT_QUOTA_CONN:
                 rc = MDT_IT_QUOTA;
                 break;
+       case IT_GLIMPSE:
+               rc = MDT_IT_GLIMPSE;
+               break;
+       case IT_BRW:
+               rc = MDT_IT_BRW;
+               break;
         default:
                 CERROR("Unknown intent opcode: 0x%08x\n", itcode);
                 rc = -EINVAL;
@@@ -3900,6 -4064,18 +4064,18 @@@ static int mdt_intent_opc(enum ldlm_int
         RETURN(rc);
   }
   
+ /*
+  * Bump the per-service counter for an intent enqueue once the intent
+  * opcode is known: IT_GLIMPSE is counted as LDLM_GLIMPSE_ENQUEUE, every
+  * other opcode as LDLM_IBITS_ENQUEUE. Nothing is done when the service
+  * has no stats attached.
+  */
+ static void mdt_ptlrpc_stats_update(struct ptlrpc_request *req,
+                                   enum ldlm_intent_flags it_opc)
+ {
+       struct lprocfs_stats *stats = ptlrpc_req2svc(req)->srv_stats;
+       int idx = PTLRPC_LAST_CNTR;
+ 
+       if (stats == NULL)
+               return;
+ 
+       /* counters for LDLM enqueue flavours live past PTLRPC_LAST_CNTR */
+       if (it_opc == IT_GLIMPSE)
+               idx += LDLM_GLIMPSE_ENQUEUE;
+       else
+               idx += LDLM_IBITS_ENQUEUE;
+ 
+       lprocfs_counter_incr(stats, idx);
+ }
+ 
   static int mdt_intent_policy(struct ldlm_namespace *ns,
                              struct ldlm_lock **lockp, void *req_cookie,
                              enum ldlm_mode mode, __u64 flags, void *data)
@@@ -3909,6 -4085,7 +4085,7 @@@
         struct ptlrpc_request   *req  =  req_cookie;
         struct ldlm_intent      *it;
         struct req_capsule      *pill;
+       const struct ldlm_lock_desc *ldesc;
         int rc;
   
         ENTRY;
@@@ -3918,37 -4095,37 +4095,37 @@@
         tsi = tgt_ses_info(req->rq_svc_thread->t_env);
   
         info = tsi2mdt_info(tsi);
-         LASSERT(info != NULL);
-         pill = info->mti_pill;
-         LASSERT(pill->rc_req == req);
+       LASSERT(info != NULL);
+       pill = info->mti_pill;
+       LASSERT(pill->rc_req == req);
+       ldesc = &info->mti_dlm_req->lock_desc;
   
-         if (req->rq_reqmsg->lm_bufcount > DLM_INTENT_IT_OFF) {
+       if (req->rq_reqmsg->lm_bufcount > DLM_INTENT_IT_OFF) {
                 req_capsule_extend(pill, &RQF_LDLM_INTENT_BASIC);
-                 it = req_capsule_client_get(pill, &RMF_LDLM_INTENT);
-                 if (it != NULL) {
-                         rc = mdt_intent_opc(it->opc, info, lockp, flags);
-                         if (rc == 0)
-                                 rc = ELDLM_OK;
- 
-                         /* Lock without inodebits makes no sense and will oops
-                          * later in ldlm. Let's check it now to see if we have
-                          * ibits corrupted somewhere in mdt_intent_opc().
-                          * The case for client miss to set ibits has been
-                          * processed by others. */
-                         LASSERT(ergo(info->mti_dlm_req->lock_desc.l_resource.\
-                                         lr_type == LDLM_IBITS,
-                                      info->mti_dlm_req->lock_desc.\
-                                         l_policy_data.l_inodebits.bits != 0));
-                 } else
-                         rc = err_serious(-EFAULT);
-         } else {
-                 /* No intent was provided */
-                 LASSERT(pill->rc_fmt == &RQF_LDLM_ENQUEUE);
+               it = req_capsule_client_get(pill, &RMF_LDLM_INTENT);
+               if (it != NULL) {
+                       mdt_ptlrpc_stats_update(req, it->opc);
+                       rc = mdt_intent_opc(it->opc, info, lockp, flags);
+                       if (rc == 0)
+                               rc = ELDLM_OK;
+ 
+                       /* Lock without inodebits makes no sense and will oops
+                        * later in ldlm. Let's check it now to see if we have
+                        * ibits corrupted somewhere in mdt_intent_opc().
+                        * The case for client miss to set ibits has been
+                        * processed by others. */
+                       LASSERT(ergo(ldesc->l_resource.lr_type == LDLM_IBITS,
+                               ldesc->l_policy_data.l_inodebits.bits != 0));
+               } else {
+                       rc = err_serious(-EFAULT);
+               }
+       } else {
+               /* No intent was provided */
                 req_capsule_set_size(pill, &RMF_DLM_LVB, RCL_SERVER, 0);
-                 rc = req_capsule_server_pack(pill);
-                 if (rc)
-                         rc = err_serious(rc);
-         }
+               rc = req_capsule_server_pack(pill);
+               if (rc)
+                       rc = err_serious(rc);
+       }
         mdt_thread_info_fini(info);
         RETURN(rc);
   }
@@@ -4631,6 -4808,11 +4808,11 @@@ static int mdt_tgt_getxattr(struct tgt_
         return rc;
   }
   
+ #define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET
+ #define OBD_FAIL_OST_WRITE_NET        OBD_FAIL_OST_BRW_NET
+ #define OST_BRW_READ  OST_READ
+ #define OST_BRW_WRITE OST_WRITE
+ 
   static struct tgt_handler mdt_tgt_handlers[] = {
   TGT_RPC_HANDLER(MDS_FIRST_OPC,
                 0,                      MDS_CONNECT,    mdt_tgt_connect,
@@@ -4671,6 -4853,14 +4853,14 @@@ TGT_MDT_HDL(HABEO_CLAVIS | HABEO_CORPU
             mdt_swap_layouts),
   };
   
+ /*
+  * Handlers for the data I/O requests (bulk read, bulk write, punch and
+  * sync). The table is attached to the OST_FIRST_OPC..OST_LAST_OPC entry
+  * of the opcode slice list defined further down in this file.
+  */
+ static struct tgt_handler mdt_io_ops[] = {
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ,        tgt_brw_read),
+ TGT_OST_HDL(HABEO_CORPUS | MUTABOR,    OST_BRW_WRITE, tgt_brw_write),
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR,
+                                        OST_PUNCH,     mdt_punch_hdl),
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_SYNC,    mdt_data_sync),
+ };
+ 
   static struct tgt_handler mdt_sec_ctx_ops[] = {
   TGT_SEC_HDL_VAR(0,                    SEC_CTX_INIT,     mdt_sec_ctx_handle),
   TGT_SEC_HDL_VAR(0,                    SEC_CTX_INIT_CONT,mdt_sec_ctx_handle),
@@@ -4732,7 -4922,11 +4922,11 @@@ static struct tgt_opc_slice mdt_common_
                 .tos_opc_end    = LFSCK_LAST_OPC,
                 .tos_hs         = tgt_lfsck_handlers
         },
- 
+       {
+               .tos_opc_start  = OST_FIRST_OPC,
+               .tos_opc_end    = OST_LAST_OPC,
+               .tos_hs         = mdt_io_ops
+       },
         {
                 .tos_hs         = NULL
         }
@@@ -4816,68 -5010,70 +5010,71 @@@ static void mdt_fini(const struct lu_en
   static int mdt_postrecov(const struct lu_env *, struct mdt_device *);
   
   static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
- -                     struct lu_device_type *ldt, struct lustre_cfg *cfg)
+ +                   struct lu_device_type *ldt, struct lustre_cfg *cfg)
   {
- -      struct mdt_thread_info    *info;
- -      struct obd_device         *obd;
+ +      const struct dt_device_param *dt_conf;
+ +      struct mdt_thread_info *info;
+ +      struct obd_device *obd;
+ +      const char *dev = lustre_cfg_string(cfg, 0);
+ +      const char *num = lustre_cfg_string(cfg, 2);
+       struct tg_grants_data *tgd = &m->mdt_lut.lut_tgd;
- -        const char                *dev = lustre_cfg_string(cfg, 0);
- -        const char                *num = lustre_cfg_string(cfg, 2);
- -        struct lustre_mount_info  *lmi = NULL;
- -        struct lustre_sb_info     *lsi;
- -        struct lu_site            *s;
- -      struct seq_server_site    *ss_site;
- -        const char                *identity_upcall = "NONE";
- -        struct md_device          *next;
- -        int                        rc;
- -      long                       node_id;
- -        mntopt_t                   mntopts;
- -        ENTRY;
+ +      struct lustre_mount_info *lmi = NULL;
+ +      struct lustre_sb_info *lsi;
+ +      struct lu_site *s;
+ +      struct seq_server_site *ss_site;
+ +      const char *identity_upcall = "NONE";
+ +      struct md_device *next;
+ +      int rc;
+ +      long node_id;
+ +      mntopt_t mntopts;
+ +      ENTRY;
   
         lu_device_init(&m->mdt_lu_dev, ldt);
- -        /*
- -         * Environment (env) might be missing mdt_thread_key values at that
- -         * point, if device is allocated when mdt_thread_key is in QUIESCENT
- -         * mode.
- -         *
- -         * Usually device allocation path doesn't use module key values, but
- -         * mdt has to do a lot of work here, so allocate key value.
- -         */
- -        rc = lu_env_refill((struct lu_env *)env);
- -        if (rc != 0)
- -                RETURN(rc);
+ +      /*
+ +       * Environment (env) might be missing mdt_thread_key values at that
+ +       * point, if device is allocated when mdt_thread_key is in QUIESCENT
+ +       * mode.
+ +       *
+ +       * Usually device allocation path doesn't use module key values, but
+ +       * mdt has to do a lot of work here, so allocate key value.
+ +       */
+ +      rc = lu_env_refill((struct lu_env *)env);
+ +      if (rc != 0)
+ +              RETURN(rc);
   
- -        info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
- -        LASSERT(info != NULL);
+ +      info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
+ +      LASSERT(info != NULL);
   
- -        obd = class_name2obd(dev);
- -        LASSERT(obd != NULL);
+ +      obd = class_name2obd(dev);
+ +      LASSERT(obd != NULL);
   
- -        m->mdt_max_mdsize = MAX_MD_SIZE; /* 4 stripes */
+ +      m->mdt_max_mdsize = MAX_MD_SIZE; /* 4 stripes */
         m->mdt_opts.mo_evict_tgt_nids = 1;
- -        m->mdt_opts.mo_cos = MDT_COS_DEFAULT;
+ +      m->mdt_opts.mo_cos = MDT_COS_DEFAULT;
   
         lmi = server_get_mount(dev);
- -        if (lmi == NULL) {
- -                CERROR("Cannot get mount info for %s!\n", dev);
- -                RETURN(-EFAULT);
- -        } else {
- -                lsi = s2lsi(lmi->lmi_sb);
- -                /* CMD is supported only in IAM mode */
- -                LASSERT(num);
- -                node_id = simple_strtol(num, NULL, 10);
+ +      if (lmi == NULL) {
+ +              CERROR("Cannot get mount info for %s!\n", dev);
+ +              RETURN(-EFAULT);
+ +      } else {
+ +              lsi = s2lsi(lmi->lmi_sb);
+ +              /* CMD is supported only in IAM mode */
+ +              LASSERT(num);
+ +              node_id = simple_strtol(num, NULL, 10);
                 obd->u.obt.obt_magic = OBT_MAGIC;
                 if (lsi->lsi_lmd != NULL &&
                     lsi->lsi_lmd->lmd_flags & LMD_FLG_SKIP_LFSCK)
                         m->mdt_skip_lfsck = 1;
         }
   
+       /* DoM files get IO lock at open by default */
+       m->mdt_opts.mo_dom_lock = 1;
+ 
         m->mdt_squash.rsi_uid = 0;
         m->mdt_squash.rsi_gid = 0;
         INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids);
         init_rwsem(&m->mdt_squash.rsi_sem);
         spin_lock_init(&m->mdt_lock);
-       m->mdt_osfs_age = cfs_time_shift_64(-1000);
         m->mdt_enable_remote_dir = 0;
         m->mdt_enable_remote_dir_gid = 0;
   
@@@ -4902,16 -5098,16 +5099,16 @@@
         s->ld_seq_site = ss_site;
         ss_site->ss_lu = s;
   
- -        /* set server index */
+ +      /* set server index */
         ss_site->ss_node_id = node_id;
   
         /* failover is the default
          * FIXME: we do not failout mds0/mgs, which may cause some problems.
          * assumed whose ss_node_id == 0 XXX
          * */
- -        obd->obd_replayable = 1;
- -        /* No connection accepted until configurations will finish */
- -        obd->obd_no_conn = 1;
+ +      obd->obd_replayable = 1;
+ +      /* No connection accepted until configurations will finish */
+ +      obd->obd_no_conn = 1;
   
         if (cfg->lcfg_bufcount > 4 && LUSTRE_CFG_BUFLEN(cfg, 4) > 0) {
                 char *str = lustre_cfg_string(cfg, 4);
@@@ -4931,25 -5127,25 +5128,25 @@@
   
         snprintf(info->mti_u.ns_name, sizeof(info->mti_u.ns_name), "%s-%s",
                  LUSTRE_MDT_NAME, obd->obd_uuid.uuid);
- -        m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
- -                                              LDLM_NAMESPACE_SERVER,
- -                                              LDLM_NAMESPACE_GREEDY,
- -                                              LDLM_NS_TYPE_MDT);
- -        if (m->mdt_namespace == NULL)
- -                GOTO(err_fini_seq, rc = -ENOMEM);
+ +      m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
+ +                                            LDLM_NAMESPACE_SERVER,
+ +                                            LDLM_NAMESPACE_GREEDY,
+ +                                            LDLM_NS_TYPE_MDT);
+ +      if (m->mdt_namespace == NULL)
+ +              GOTO(err_fini_seq, rc = -ENOMEM);
   
         m->mdt_namespace->ns_lvbp = m;
         m->mdt_namespace->ns_lvbo = &mdt_lvbo;
   
- -        ldlm_register_intent(m->mdt_namespace, mdt_intent_policy);
- -        /* set obd_namespace for compatibility with old code */
- -        obd->obd_namespace = m->mdt_namespace;
+ +      ldlm_register_intent(m->mdt_namespace, mdt_intent_policy);
+ +      /* set obd_namespace for compatibility with old code */
+ +      obd->obd_namespace = m->mdt_namespace;
   
         rc = mdt_hsm_cdt_init(m);
         if (rc != 0) {
                 CERROR("%s: error initializing coordinator, rc %d\n",
                        mdt_obd_name(m), rc);
- -                GOTO(err_free_ns, rc);
+ +              GOTO(err_free_ns, rc);
         }
   
         rc = tgt_init(env, &m->mdt_lut, obd, m->mdt_bottom, mdt_common_slice,
@@@ -4958,28 -5154,40 +5155,37 @@@
         if (rc)
                 GOTO(err_free_hsm, rc);
   
+       /* Amount of available space excluded from granting and reserved
+        * for metadata. It is in percentage and 50% is default value. */
+       tgd->tgd_reserved_pcnt = 50;
+ 
+       if (ONE_MB_BRW_SIZE < (1U << tgd->tgd_blockbits))
+               m->mdt_brw_size = 1U << tgd->tgd_blockbits;
+       else
+               m->mdt_brw_size = ONE_MB_BRW_SIZE;
+ 
         rc = mdt_fs_setup(env, m, obd, lsi);
         if (rc)
                 GOTO(err_tgt, rc);
   
         tgt_adapt_sptlrpc_conf(&m->mdt_lut);
   
- -        next = m->mdt_child;
- -        rc = next->md_ops->mdo_iocontrol(env, next, OBD_IOC_GET_MNTOPT, 0,
- -                                         &mntopts);
- -        if (rc)
- -              GOTO(err_fs_cleanup, rc);
+ +      next = m->mdt_child;
+ +      dt_conf = next->md_ops->mdo_dtconf_get(env, next);
   
- -        if (mntopts & MNTOPT_USERXATTR)
- -                m->mdt_opts.mo_user_xattr = 1;
- -        else
- -                m->mdt_opts.mo_user_xattr = 0;
+ +      mntopts = dt_conf->ddp_mntopts;
   
- -      rc = next->md_ops->mdo_maxeasize_get(env, next, &m->mdt_max_ea_size);
- -      if (rc)
- -              GOTO(err_fs_cleanup, rc);
+ +      if (mntopts & MNTOPT_USERXATTR)
+ +              m->mdt_opts.mo_user_xattr = 1;
+ +      else
+ +              m->mdt_opts.mo_user_xattr = 0;
   
- -        if (mntopts & MNTOPT_ACL)
- -                m->mdt_opts.mo_acl = 1;
- -        else
- -                m->mdt_opts.mo_acl = 0;
+ +      m->mdt_max_ea_size = dt_conf->ddp_max_ea_size;
+ +
+ +      if (mntopts & MNTOPT_ACL)
+ +              m->mdt_opts.mo_acl = 1;
+ +      else
+ +              m->mdt_opts.mo_acl = 0;
   
         /* XXX: to support suppgid for ACL, we enable identity_upcall
          * by default, otherwise, maybe got unexpected -EACCESS. */
@@@ -4995,11 -5203,11 +5201,11 @@@
                 GOTO(err_fs_cleanup, rc);
         }
   
- -        rc = mdt_procfs_init(m, dev);
- -        if (rc) {
- -                CERROR("Can't init MDT lprocfs, rc %d\n", rc);
- -                GOTO(err_recovery, rc);
- -        }
+ +      rc = mdt_procfs_init(m, dev);
+ +      if (rc) {
+ +              CERROR("Can't init MDT lprocfs, rc %d\n", rc);
+ +              GOTO(err_recovery, rc);
+ +      }
   
         rc = mdt_quota_init(env, m, cfg);
         if (rc)
@@@ -5015,13 -5223,13 +5221,13 @@@
          * when the whole stack is complete and ready
          * to serve the requests */
   
- -        /* Reduce the initial timeout on an MDS because it doesn't need such
- -         * a long timeout as an OST does. Adaptive timeouts will adjust this
- -         * value appropriately. */
- -        if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
- -                ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
+ +      /* Reduce the initial timeout on an MDS because it doesn't need such
+ +       * a long timeout as an OST does. Adaptive timeouts will adjust this
+ +       * value appropriately. */
+ +      if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
+ +              ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
   
- -        RETURN(0);
+ +      RETURN(0);
   err_procfs:
         mdt_procfs_fini(m);
   err_recovery:
@@@ -5155,6 -5363,7 +5361,7 @@@ static struct lu_object *mdt_object_all
                 o->lo_ops = &mdt_obj_ops;
                 spin_lock_init(&mo->mot_write_lock);
                 mutex_init(&mo->mot_lov_mutex);
+               init_rwsem(&mo->mot_dom_sem);
                 init_rwsem(&mo->mot_open_sem);
                 atomic_set(&mo->mot_open_count, 0);
                 RETURN(o);
@@@ -5323,9 -5532,10 +5530,10 @@@ static int mdt_obd_set_info_async(cons
    * \retval -EPROTO \a data unexpectedly has zero obd_connect_data::ocd_brw_size
    * \retval -EBADE  client and server feature requirements are incompatible
    */
- static int mdt_connect_internal(struct obd_export *exp,
+ static int mdt_connect_internal(const struct lu_env *env,
+                               struct obd_export *exp,
                                 struct mdt_device *mdt,
-                               struct obd_connect_data *data)
+                               struct obd_connect_data *data, bool reconnect)
   {
         LASSERT(data != NULL);
   
@@@ -5357,7 -5567,8 +5565,8 @@@
                 data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
   
         if (OCD_HAS_FLAG(data, BRW_SIZE)) {
-               data->ocd_brw_size = min(data->ocd_brw_size, MD_MAX_BRW_SIZE);
+               data->ocd_brw_size = min(data->ocd_brw_size,
+                                        mdt->mdt_brw_size);
                 if (data->ocd_brw_size == 0) {
                         CERROR("%s: cli %s/%p ocd_connect_flags: %#llx "
                                "ocd_version: %x ocd_grant: %d ocd_index: %u "
@@@ -5371,6 -5582,30 +5580,30 @@@
                 }
         }
   
+       if (OCD_HAS_FLAG(data, GRANT_PARAM)) {
+               struct dt_device_param *ddp = &mdt->mdt_lut.lut_dt_conf;
+ 
+               /* client is reporting its page size, for future use */
+               exp->exp_target_data.ted_pagebits = data->ocd_grant_blkbits;
+               data->ocd_grant_blkbits  = mdt->mdt_lut.lut_tgd.tgd_blockbits;
+               /* ddp_inodespace may not be power-of-two value, eg. for ldiskfs
+                * it's LDISKFS_DIR_REC_LEN(20) = 28. */
+               data->ocd_grant_inobits = fls(ddp->ddp_inodespace - 1);
+               /* ocd_grant_tax_kb is in 1K byte blocks */
+               data->ocd_grant_tax_kb = ddp->ddp_extent_tax >> 10;
+               data->ocd_grant_max_blks = ddp->ddp_max_extent_blks;
+       }
+ 
+       if (OCD_HAS_FLAG(data, GRANT)) {
+               /* Save connect_data we have so far because tgt_grant_connect()
+                * uses it to calculate grant. */
+               exp->exp_connect_data = *data;
+               tgt_grant_connect(env, exp, data, !reconnect);
+       }
+ 
+       if (OCD_HAS_FLAG(data, MAXBYTES))
+               data->ocd_maxbytes = mdt->mdt_lut.lut_dt_conf.ddp_maxbytes;
+ 
         /* NB: Disregard the rule against updating
          * exp_connect_data.ocd_connect_flags in this case, since
          * tgt_client_new() needs to know if this is a lightweight
@@@ -5414,6 -5649,32 +5647,32 @@@
                 spin_unlock(&exp->exp_lock);
         }
   
+       if (OCD_HAS_FLAG(data, CKSUM)) {
+               __u32 cksum_types = data->ocd_cksum_types;
+ 
+               /* The client set in ocd_cksum_types the checksum types it
+                * supports. We have to mask off the algorithms that we don't
+                * support */
+               data->ocd_cksum_types &= cksum_types_supported_server();
+ 
+               if (unlikely(data->ocd_cksum_types == 0)) {
+                       CERROR("%s: Connect with checksum support but no "
+                              "ocd_cksum_types is set\n",
+                              exp->exp_obd->obd_name);
+                       RETURN(-EPROTO);
+               }
+ 
+               CDEBUG(D_RPCTRACE, "%s: cli %s supports cksum type %x, return "
+                      "%x\n", exp->exp_obd->obd_name, obd_export_nid2str(exp),
+                      cksum_types, data->ocd_cksum_types);
+       } else {
+               /* This client does not support OBD_CONNECT_CKSUM
+                * fall back to CRC32 */
+               CDEBUG(D_RPCTRACE, "%s: cli %s does not support "
+                      "OBD_CONNECT_CKSUM, CRC32 will be used\n",
+                      exp->exp_obd->obd_name, obd_export_nid2str(exp));
+       }
+ 
         return 0;
   }
   
@@@ -5538,11 -5799,15 +5797,15 @@@ static inline void mdt_disable_slc(stru
   
   static int mdt_obd_disconnect(struct obd_export *exp)
   {
-         int rc;
-         ENTRY;
+       int rc;
   
-         LASSERT(exp);
-         class_export_get(exp);
+       ENTRY;
+ 
+       LASSERT(exp);
+       class_export_get(exp);
+ 
+       if (!(exp->exp_flags & OBD_OPT_FORCE))
+               tgt_grant_sanity_check(exp->exp_obd, __func__);
   
         if ((exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
             !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT)) {
@@@ -5556,6 -5821,8 +5819,8 @@@
         if (rc != 0)
                 CDEBUG(D_IOCTL, "server disconnect error: rc = %d\n", rc);
   
+       tgt_grant_discard(exp);
+ 
         rc = mdt_export_cleanup(exp);
         nodemap_del_member(exp);
         class_export_put(exp);
@@@ -5617,7 -5884,7 +5882,7 @@@ static int mdt_obd_connect(const struc
         if (rc != 0 && rc != -EEXIST)
                 GOTO(out, rc);
   
-       rc = mdt_connect_internal(lexp, mdt, data);
+       rc = mdt_connect_internal(env, lexp, mdt, data, false);
         if (rc == 0) {
                 struct lsd_client_data *lcd = lexp->exp_target_data.ted_lcd;
   
@@@ -5663,7 -5930,8 +5928,8 @@@ static int mdt_obd_reconnect(const stru
         if (rc != 0 && rc != -EEXIST)
                 RETURN(rc);
   
-       rc = mdt_connect_internal(exp, mdt_dev(obd->obd_lu_dev), data);
+       rc = mdt_connect_internal(env, exp, mdt_dev(obd->obd_lu_dev), data,
+                                 true);
         if (rc == 0)
                 mdt_export_stats_init(obd, exp, localdata);
         else
@@@ -5725,6 -5993,17 +5991,17 @@@ static int mdt_destroy_export(struct ob
         LASSERT(list_empty(&exp->exp_outstanding_replies));
         LASSERT(list_empty(&exp->exp_mdt_data.med_open_head));
   
+       /*
+        * discard grants once we're sure no more
+        * interaction with the client is possible
+        */
+       tgt_grant_discard(exp);
+       if (exp_connect_flags(exp) & OBD_CONNECT_GRANT)
+               exp->exp_obd->u.obt.obt_lut->lut_tgd.tgd_tot_granted_clients--;
+ 
+       if (!(exp->exp_flags & OBD_OPT_FORCE))
+               tgt_grant_sanity_check(exp->exp_obd, __func__);
+ 
         RETURN(0);
   }
   
@@@ -6290,6 -6569,9 +6567,9 @@@ static struct obd_ops mdt_obd_device_op
           .o_destroy_export = mdt_destroy_export,
           .o_iocontrol      = mdt_iocontrol,
           .o_postrecov      = mdt_obd_postrecov,
+       /* Data-on-MDT IO methods */
+       .o_preprw         = mdt_obd_preprw,
+       .o_commitrw       = mdt_obd_commitrw,
   };
   
   static struct lu_device* mdt_device_fini(const struct lu_env *env,
diff --combined lustre/mdt/mdt_internal.h

index 2ba6f9a,d876431..ecd0a85
--- 1/lustre/mdt/mdt_internal.h
--- 2/lustre/mdt/mdt_internal.h
+++ b/lustre/mdt/mdt_internal.h
@@@ -179,8 -179,6 +179,8 @@@ struct coordinator 
   
         /* Remove archive on last unlink policy */
         bool                     cdt_remove_archive_on_last_unlink;
+ +
+ +      bool                     cdt_wakeup_coordinator;
   };
   
   /* mdt state flag bits */
@@@ -209,7 -207,8 +209,8 @@@ struct mdt_device 
                 unsigned int       mo_user_xattr:1,
                                    mo_acl:1,
                                    mo_cos:1,
-                                  mo_evict_tgt_nids:1;
+                                  mo_evict_tgt_nids:1,
+                                  mo_dom_lock:1;
         } mdt_opts;
           /* mdt state flags */
           unsigned long              mdt_state;
@@@ -223,6 -222,9 +224,9 @@@
   
         int                        mdt_max_ea_size;
   
+       /* preferred BRW size, decided by storage type and capability */
+       __u32                      mdt_brw_size;
+ 
           struct upcall_cache        *mdt_identity_cache;
   
         unsigned int               mdt_capa_conf:1,
@@@ -235,10 -237,6 +239,6 @@@
         /* lock for osfs and md_root */
         spinlock_t                 mdt_lock;
   
-       /* statfs optimization: we cache a bit  */
-       struct obd_statfs          mdt_osfs;
-       __u64                      mdt_osfs_age;
- 
           /* root squash */
         struct root_squash_info    mdt_squash;
   
@@@ -274,6 -272,8 +274,8 @@@ struct mdt_object 
         spinlock_t              mot_write_lock;
           /* Lock to protect create_data */
         struct mutex            mot_lov_mutex;
+       /* lock to protect read/write stages for Data-on-MDT files */
+       struct rw_semaphore     mot_dom_sem;
         /* Lock to protect lease open.
          * Lease open acquires write lock; normal open acquires read lock */
         struct rw_semaphore     mot_open_sem;
@@@ -323,7 -323,7 +325,7 @@@ enum 
   #define MDT_EREMOTE_OPEN (EREMOTE + 1024)
   
   struct mdt_reint_record {
- -      mdt_reint_t                      rr_opcode;
+ +      enum mds_reint_op                rr_opcode;
         const struct lustre_handle      *rr_handle;
         const struct lu_fid             *rr_fid1;
         const struct lu_fid             *rr_fid2;
@@@ -615,6 -615,44 +617,44 @@@ static inline bool mdt_is_striped_clien
         return exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE;
   }
   
+ /* Layout classification returned by mdt_lmm_dom_entry(). */
+ enum {
+       LMM_NO_DOM,     /* not composite, or first entry is not MDT */
+       LMM_DOM_ONLY,   /* MDT first entry, no OST object in later ones */
+       LMM_DOM_OST     /* MDT first entry and an OST object later on */
+ };
+ 
+ /*
+  * Classify a layout by its Data-on-MDT (DoM) content.
+  *
+  * XXX Look into layout in MDT layer. This must be done in LOD.
+  *
+  * \a lmm is taken to be in its little-endian on-disk/wire form, so every
+  * multi-byte field is converted with le*_to_cpu() before it is compared
+  * or used as an offset/count; reading the fields raw only works on
+  * little-endian hosts.
+  *
+  * \param[in] lmm      layout to inspect
+  *
+  * \retval LMM_NO_DOM   layout is not LOV_MAGIC_COMP_V1, or its first
+  *                      entry does not have the LOV_PATTERN_MDT pattern
+  * \retval LMM_DOM_OST  first entry is DoM and some later entry holds an
+  *                      object whose OST index is not (__u32)-1
+  * \retval LMM_DOM_ONLY first entry is DoM and no later entry holds such
+  *                      an object
+  */
+ static inline int mdt_lmm_dom_entry(struct lov_mds_md *lmm)
+ {
+       struct lov_comp_md_v1 *comp_v1;
+       struct lov_mds_md *v1;
+       __u32 off;
+       int i;
+ 
+       if (le32_to_cpu(lmm->lmm_magic) != LOV_MAGIC_COMP_V1)
+               return LMM_NO_DOM;
+ 
+       comp_v1 = (struct lov_comp_md_v1 *)lmm;
+       off = le32_to_cpu(comp_v1->lcm_entries[0].lcme_offset);
+       v1 = (struct lov_mds_md *)((char *)comp_v1 + off);
+ 
+       /* DoM entry is the first entry always */
+       if (lov_pattern(le32_to_cpu(v1->lmm_pattern)) != LOV_PATTERN_MDT)
+               return LMM_NO_DOM;
+ 
+       for (i = 1; i < le16_to_cpu(comp_v1->lcm_entry_count); i++) {
+               int j;
+ 
+               off = le32_to_cpu(comp_v1->lcm_entries[i].lcme_offset);
+               v1 = (struct lov_mds_md *)((char *)comp_v1 + off);
+ 
+               for (j = 0; j < le16_to_cpu(v1->lmm_stripe_count); j++) {
+                       /* if there is any object on OST */
+                       if (le32_to_cpu(v1->lmm_objects[j].l_ost_idx) !=
+                           (__u32)-1UL)
+                               return LMM_DOM_OST;
+               }
+       }
+ 
+       return LMM_DOM_ONLY;
+ }
+ 
   __u64 mdt_get_disposition(struct ldlm_reply *rep, __u64 op_flag);
   void mdt_set_disposition(struct mdt_thread_info *info,
                          struct ldlm_reply *rep, __u64 op_flag);
@@@ -645,6 -683,8 +685,8 @@@ int mdt_object_lock_try(struct mdt_thre
   
   void mdt_object_unlock(struct mdt_thread_info *info, struct mdt_object *mo,
                        struct mdt_lock_handle *lh, int decref);
+ void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
+                  enum ldlm_mode mode, int decref);
   
   struct mdt_object *mdt_object_new(const struct lu_env *env,
                                   struct mdt_device *,
@@@ -685,8 -725,9 +727,9 @@@ int mdt_pack_acl2body(struct mdt_thread
                       struct mdt_object *o, struct lu_nodemap *nodemap);
   #endif
   void mdt_pack_attr2body(struct mdt_thread_info *info, struct mdt_body *b,
-                         const struct lu_attr *attr, const struct lu_fid *fid);
- 
+                       const struct lu_attr *attr, const struct lu_fid *fid);
+ int mdt_pack_size2body(struct mdt_thread_info *info,
+                       const struct lu_fid *fid, bool dom_lock);
   int mdt_getxattr(struct mdt_thread_info *info);
   int mdt_reint_setxattr(struct mdt_thread_info *info,
                          struct mdt_lock_handle *lh);
@@@ -765,6 -806,13 +808,13 @@@ void mdt_thread_info_init(struct ptlrpc
                           struct mdt_thread_info *mti);
   void mdt_thread_info_fini(struct mdt_thread_info *mti);
   struct mdt_thread_info *tsi2mdt_info(struct tgt_session_info *tsi);
+ void mdt_intent_fixup_resent(struct mdt_thread_info *info,
+                            struct ldlm_lock *new_lock,
+                            struct mdt_lock_handle *lh, __u64 flags);
+ int mdt_intent_lock_replace(struct mdt_thread_info *info,
+                           struct ldlm_lock **lockp,
+                           struct mdt_lock_handle *lh,
+                           __u64 flags, int result);
   
   int mdt_hsm_attr_set(struct mdt_thread_info *info, struct mdt_object *obj,
                      const struct md_hsm *mh);
@@@ -1008,6 -1056,11 +1058,11 @@@ static inline int is_identity_get_disab
   
   int mdt_blocking_ast(struct ldlm_lock*, struct ldlm_lock_desc*, void*, int);
   
+ /*
+  * Glimpse AST stub for locally enqueued MDT locks.
+  *
+  * mdt_fid_lock() installs it as the glimpse callback when the requested
+  * inodebits include MDS_INODELOCK_DOM.  Both arguments are ignored and the
+  * answer is always -ELDLM_NO_LOCK_DATA.
+  *
+  * Declared inline like the other helpers defined in this header, so that a
+  * file including the header without referencing the function does not carry
+  * an unused static definition.
+  */
+ static inline int mdt_dom_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+ {
+       return -ELDLM_NO_LOCK_DATA;
+ }
+ 
   /* Enqueues a local IBITS dlm lock in @ns and stores its handle in @lh. */
   static inline int mdt_fid_lock(struct ldlm_namespace *ns,
                                struct lustre_handle *lh, enum ldlm_mode mode,
@@@ -1016,14 -1069,16 +1071,16 @@@
                                __u64 flags, const __u64 *client_cookie)
   {
         int rc;
+       bool glimpse = policy->l_inodebits.bits & MDS_INODELOCK_DOM;
   
         LASSERT(ns != NULL);
         LASSERT(lh != NULL);
   
         rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, policy,
                                     mode, &flags, mdt_blocking_ast,
-                                   ldlm_completion_ast, NULL, NULL, 0,
-                                   LVB_T_NONE, client_cookie, lh);
+                                   ldlm_completion_ast,
+                                   glimpse ? mdt_dom_glimpse_ast : NULL,
+                                   NULL, 0, LVB_T_NONE, client_cookie, lh);
         return rc == ELDLM_OK ? 0 : -EIO;
   }
   
@@@ -1056,6 -1111,9 +1113,9 @@@ static inline enum ldlm_mode mdt_mdl_mo
   
   /* mdt_lvb.c */
   extern struct ldlm_valblock_ops mdt_lvbo;
+ int mdt_dom_lvb_is_valid(struct ldlm_resource *res);
+ int mdt_dom_lvbo_update(struct ldlm_resource *res, struct ldlm_lock *lock,
+                       struct ptlrpc_request *req, bool increase_only);
   
   void mdt_enable_cos(struct mdt_device *, int);
   int mdt_cos_is_enabled(struct mdt_device *);
@@@ -1076,9 -1134,12 +1136,12 @@@ enum 
           LPROC_MDT_SETXATTR,
           LPROC_MDT_STATFS,
           LPROC_MDT_SYNC,
-         LPROC_MDT_SAMEDIR_RENAME,
-         LPROC_MDT_CROSSDIR_RENAME,
-         LPROC_MDT_LAST,
+       LPROC_MDT_SAMEDIR_RENAME,
+       LPROC_MDT_CROSSDIR_RENAME,
+       LPROC_MDT_IO_READ,
+       LPROC_MDT_IO_WRITE,
+       LPROC_MDT_IO_PUNCH,
+       LPROC_MDT_LAST,
   };
   void mdt_counter_incr(struct ptlrpc_request *req, int opcode);
   void mdt_stats_counter_init(struct lprocfs_stats *stats);
@@@ -1119,4 -1180,49 +1182,49 @@@ static inline char *mdt_req_get_jobid(s
         return jobid;
   }
   
+ /* MDT IO */
+ 
+ #define VALID_FLAGS (LA_TYPE | LA_MODE | LA_SIZE | LA_BLOCKS | \
+                    LA_BLKSIZE | LA_ATIME | LA_MTIME | LA_CTIME)
+ 
+ int mdt_obd_preprw(const struct lu_env *env, int cmd, struct obd_export *exp,
+                  struct obdo *oa, int objcount, struct obd_ioobj *obj,
+                  struct niobuf_remote *rnb, int *nr_local,
+                  struct niobuf_local *lnb);
+ 
+ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
+                    struct obdo *oa, int objcount, struct obd_ioobj *obj,
+                    struct niobuf_remote *rnb, int npages,
+                    struct niobuf_local *lnb, int old_rc);
+ int mdt_punch_hdl(struct tgt_session_info *tsi);
+ int mdt_glimpse_enqueue(struct mdt_thread_info *mti, struct ldlm_namespace *ns,
+                       struct ldlm_lock **lockp, __u64 flags);
+ int mdt_brw_enqueue(struct mdt_thread_info *info, struct ldlm_namespace *ns,
+                   struct ldlm_lock **lockp, __u64 flags);
+ void mdt_dom_discard_data(struct mdt_thread_info *info,
+                         const struct lu_fid *fid);
+ int mdt_dom_disk_lvbo_update(const struct lu_env *env, struct mdt_object *mo,
+                            struct ldlm_resource *res, bool increase_only);
+ void mdt_dom_obj_lvb_update(const struct lu_env *env, struct mdt_object *mo,
+                           bool increase_only);
+ int mdt_dom_lvb_alloc(struct ldlm_resource *res);
+ 
+ /*
+  * Call mdt_dom_discard_data() for the FID of @mo, but only when the object
+  * is reported as dying and its attributes describe a regular file.
+  * In every other case this is a no-op.
+  */
+ static inline void mdt_dom_check_and_discard(struct mdt_thread_info *mti,
+                                            struct mdt_object *mo)
+ {
+       /* object is still alive: nothing to discard */
+       if (!lu_object_is_dying(&mo->mot_header))
+               return;
+ 
+       /* only regular files are handled here */
+       if (!S_ISREG(lu_object_attr(&mo->mot_obj)))
+               return;
+ 
+       mdt_dom_discard_data(mti, mdt_object_fid(mo));
+ }
+ 
+ int mdt_dom_object_size(const struct lu_env *env, struct mdt_device *mdt,
+                       const struct lu_fid *fid, struct mdt_body *mb,
+                       bool dom_lock);
+ bool mdt_dom_client_has_lock(struct mdt_thread_info *info,
+                            const struct lu_fid *fid);
+ /* grants */
+ long mdt_grant_connect(const struct lu_env *env, struct obd_export *exp,
+                      u64 want, bool conservative);
+ extern struct kmem_cache *ldlm_glimpse_work_kmem;
+ 
   #endif /* _MDT_INTERNAL_H */
diff --combined lustre/mdt/mdt_mds.c

index 2b12f84,078051a..b9c806f
--- 1/lustre/mdt/mdt_mds.c
--- 2/lustre/mdt/mdt_mds.c
+++ b/lustre/mdt/mdt_mds.c
@@@ -64,7 -64,9 +64,8 @@@ struct mds_device 
         struct ptlrpc_service   *mds_mdsc_service;
         struct ptlrpc_service   *mds_mdss_service;
         struct ptlrpc_service   *mds_fld_service;
+       struct ptlrpc_service   *mds_io_service;
         struct mutex             mds_health_mutex;
- -      struct kset             *mds_kset;
   };
   
   /*
@@@ -74,6 -76,10 +75,10 @@@ static unsigned long mds_num_threads
   module_param(mds_num_threads, ulong, 0444);
   MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
   
+ /*
+  * Module parameter with 0444 permissions, default 512.  The value is used as
+  * tc_nthrs_max of the LUSTRE_MDT_NAME "_io" ptlrpc service configuration.
+  */
+ int mds_max_io_threads = 512;
+ module_param(mds_max_io_threads, int, 0444);
+ MODULE_PARM_DESC(mds_max_io_threads, "maximum number of MDS IO service threads");
+ 
   static char *mds_num_cpts;
   module_param(mds_num_cpts, charp, 0444);
   MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
@@@ -133,6 -139,10 +138,10 @@@ static void mds_stop_ptlrpc_service(str
                 ptlrpc_unregister_service(m->mds_fld_service);
                 m->mds_fld_service = NULL;
         }
+       if (m->mds_io_service != NULL) {
+               ptlrpc_unregister_service(m->mds_io_service);
+               m->mds_io_service = NULL;
+       }
         mutex_unlock(&m->mds_health_mutex);
   
         EXIT;
@@@ -183,7 -193,7 +192,7 @@@ static int mds_start_ptlrpc_service(str
                         .so_hpreq_handler       = ptlrpc_hpreq_handler,
                 },
         };
- -      m->mds_regular_service = ptlrpc_register_service(&conf, m->mds_kset,
+ +      m->mds_regular_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                          procfs_entry);
         if (IS_ERR(m->mds_regular_service)) {
                 rc = PTR_ERR(m->mds_regular_service);
@@@ -227,7 -237,7 +236,7 @@@
                         .so_req_printer         = target_print_req,
                 },
         };
- -      m->mds_readpage_service = ptlrpc_register_service(&conf, m->mds_kset,
+ +      m->mds_readpage_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                           procfs_entry);
         if (IS_ERR(m->mds_readpage_service)) {
                 rc = PTR_ERR(m->mds_readpage_service);
@@@ -275,7 -285,7 +284,7 @@@
                         .so_hpreq_handler       = NULL,
                 },
         };
- -      m->mds_setattr_service = ptlrpc_register_service(&conf, m->mds_kset,
+ +      m->mds_setattr_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                          procfs_entry);
         if (IS_ERR(m->mds_setattr_service)) {
                 rc = PTR_ERR(m->mds_setattr_service);
@@@ -321,7 -331,7 +330,7 @@@
                         .so_hpreq_handler       = NULL,
                 },
         };
- -      m->mds_out_service = ptlrpc_register_service(&conf, m->mds_kset,
+ +      m->mds_out_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                      procfs_entry);
         if (IS_ERR(m->mds_out_service)) {
                 rc = PTR_ERR(m->mds_out_service);
@@@ -357,7 -367,7 +366,7 @@@
                         .so_hpreq_handler       = NULL,
                 },
         };
- -      m->mds_mdsc_service = ptlrpc_register_service(&conf, m->mds_kset,
+ +      m->mds_mdsc_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                       procfs_entry);
         if (IS_ERR(m->mds_mdsc_service)) {
                 rc = PTR_ERR(m->mds_mdsc_service);
@@@ -394,7 -404,7 +403,7 @@@
                         .so_hpreq_handler       = NULL,
                 },
         };
- -      m->mds_mdss_service = ptlrpc_register_service(&conf, m->mds_kset,
+ +      m->mds_mdss_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                       procfs_entry);
         if (IS_ERR(m->mds_mdss_service)) {
                 rc = PTR_ERR(m->mds_mdss_service);
@@@ -429,7 -439,7 +438,7 @@@
                         .so_hpreq_handler       = NULL,
                 },
         };
- -      m->mds_fld_service = ptlrpc_register_service(&conf, m->mds_kset,
+ +      m->mds_fld_service = ptlrpc_register_service(&conf, &obd->obd_kset,
                                                      procfs_entry);
         if (IS_ERR(m->mds_fld_service)) {
                 rc = PTR_ERR(m->mds_fld_service);
@@@ -439,6 -449,43 +448,43 @@@
                 GOTO(err_mds_svc, rc);
         }
   
- -      m->mds_io_service = ptlrpc_register_service(&conf, m->mds_kset,
+       memset(&conf, 0, sizeof(conf));
+       conf = (typeof(conf)) {
+               .psc_name               = LUSTRE_MDT_NAME "_io",
+               .psc_watchdog_factor    = MDT_SERVICE_WATCHDOG_FACTOR,
+               .psc_buf                = {
+                       .bc_nbufs               = OST_NBUFS,
+                       .bc_buf_size            = OST_IO_BUFSIZE,
+                       .bc_req_max_size        = OST_IO_MAXREQSIZE,
+                       .bc_rep_max_size        = OST_IO_MAXREPSIZE,
+                       .bc_req_portal          = MDS_IO_PORTAL,
+                       .bc_rep_portal          = MDC_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ll_mdt_io",
+                       .tc_thr_factor          = OSS_THR_FACTOR,
+                       .tc_nthrs_init          = OSS_NTHRS_INIT,
+                       .tc_nthrs_base          = OSS_NTHRS_BASE,
+                       .tc_nthrs_max           = mds_max_io_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_DT_THREAD | LCT_MD_THREAD,
+               },
+               .psc_ops                = {
+                       .so_thr_init            = tgt_io_thread_init,
+                       .so_thr_done            = tgt_io_thread_done,
+                       .so_req_handler         = tgt_request_handle,
+                       .so_req_printer         = target_print_req,
+               },
+       };
++      m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+                                                   procfs_entry);
+       if (IS_ERR(m->mds_io_service)) {
+               rc = PTR_ERR(m->mds_io_service);
+               CERROR("failed to start MDT I/O service: %d\n", rc);
+               m->mds_io_service = NULL;
+               GOTO(err_mds_svc, rc);
+       }
+ 
         EXIT;
   err_mds_svc:
         if (rc)
@@@ -460,7 -507,7 +506,7 @@@ static struct lu_device *mds_device_fin
         ENTRY;
   
         mds_stop_ptlrpc_service(m);
- -      lprocfs_kset_unregister(obd, m->mds_kset);
+ +      lprocfs_obd_cleanup(obd);
         RETURN(NULL);
   }
   
@@@ -498,7 -545,7 +544,7 @@@ static struct lu_device *mds_device_all
         /* set this lu_device to obd, because error handling need it */
         obd->obd_lu_dev = l;
   
- -      rc = lprocfs_kset_register(obd, &m->mds_kset);
+ +      rc = lprocfs_obd_setup(obd, true);
         if (rc != 0) {
                 mds_device_free(env, l);
                 l = ERR_PTR(rc);
@@@ -509,7 -556,7 +555,7 @@@
   
         rc = mds_start_ptlrpc_service(m);
         if (rc != 0) {
- -              lprocfs_kset_unregister(obd, m->mds_kset);
+ +              lprocfs_obd_cleanup(obd);
                 mds_device_free(env, l);
                 l = ERR_PTR(rc);
                 return l;
@@@ -553,6 -600,7 +599,7 @@@ static int mds_health_check(const struc
         rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
         rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
         rc |= ptlrpc_service_health_check(mds->mds_fld_service);
+       rc |= ptlrpc_service_health_check(mds->mds_io_service);
         mutex_unlock(&mds->mds_health_mutex);
   
         return rc != 0 ? 1 : 0;
diff --combined lustre/mdt/mdt_open.c

index 2fcdb85,76555b5..2b6ee7d
--- 1/lustre/mdt/mdt_open.c
--- 2/lustre/mdt/mdt_open.c
+++ b/lustre/mdt/mdt_open.c
@@@ -775,11 -775,15 +775,15 @@@ static int mdt_object_open_lock(struct 
   {
         struct md_attr *ma = &info->mti_attr;
         __u64 open_flags = info->mti_spec.sp_cr_flags;
+       __u64 trybits = 0;
         enum ldlm_mode lm = LCK_CR;
         bool acq_lease = !!(open_flags & MDS_OPEN_LEASE);
         bool try_layout = false;
         bool create_layout = false;
         int rc = 0;
+       int dom_stripes = LMM_NO_DOM;
+       bool dom_lock = false;
+ 
         ENTRY;
   
         *ibits = 0;
@@@ -795,6 -799,24 +799,24 @@@
                 if (exp_connect_layout(info->mti_exp) && !create_layout &&
                     ma->ma_need & MA_LOV)
                         try_layout = true;
+ 
+               /* DoM files can have just MDT stripe or combined MDT + OST
+                * stripes.
+                * - In the first case the open for read/write will do IO to
+                *   the MDT stripe and it makes sense to take IO lock in
+                *   advance along with OPEN even if it is blocking lock.
+                * - In the second case it is just size of MDT stripe and it
+                *   is quite unlikely that client will write into it, though
+                *   it may read it. So IO lock will be taken optionally if it
+                *   is non-blocking one.
+                */
+               if (ma->ma_valid & MA_LOV && ma->ma_lmm != NULL)
+                       dom_stripes = mdt_lmm_dom_entry(ma->ma_lmm);
+ 
+               if (dom_stripes == LMM_DOM_ONLY &&
+                   info->mti_mdt->mdt_opts.mo_dom_lock != 0 &&
+                   !mdt_dom_client_has_lock(info, mdt_object_fid(obj)))
+                       dom_lock = true;
         }
   
         if (acq_lease) {
@@@ -847,7 -869,12 +869,12 @@@
                         try_layout = false;
   
                         lhc = &info->mti_lh[MDT_LH_LOCAL];
+               } else if (dom_lock) {
+                       lm = (open_flags & FMODE_WRITE) ? LCK_PW : LCK_PR;
+                       *ibits = MDS_INODELOCK_DOM;
+                       try_layout = false;
                 }
+ 
                 CDEBUG(D_INODE, "normal open:"DFID" lease count: %d, lm: %d\n",
                         PFID(mdt_object_fid(obj)),
                         atomic_read(&obj->mot_open_count), lm);
@@@ -863,17 -890,18 +890,18 @@@
                  * lock for each open.
                  * However this is a double-edged sword because changing
                  * permission will revoke huge # of LOOKUP locks. */
-               rc = mdt_object_lock_try(info, obj, lhc, ibits,
-                                        MDS_INODELOCK_LAYOUT |
-                                        MDS_INODELOCK_LOOKUP, false);
-       } else if (*ibits != 0) {
-               rc = mdt_object_lock(info, obj, lhc, *ibits);
+               trybits |= MDS_INODELOCK_LAYOUT | MDS_INODELOCK_LOOKUP;
         }
   
-       CDEBUG(D_INODE, "%s: Requested bits lock:"DFID ", ibits = %#llx"
+       if (trybits != 0)
+               rc = mdt_object_lock_try(info, obj, lhc, ibits, trybits, false);
+       else if (*ibits != 0)
+               rc = mdt_object_lock(info, obj, lhc, *ibits);
+ 
+       CDEBUG(D_INODE, "%s: Requested bits lock:"DFID ", ibits = %#llx/%#llx"
                ", open_flags = %#llo, try_layout = %d : rc = %d\n",
                mdt_obd_name(info->mti_mdt), PFID(mdt_object_fid(obj)),
-              *ibits, open_flags, try_layout, rc);
+              *ibits, trybits, open_flags, try_layout, rc);
   
         /* will change layout, revoke layout locks by enqueuing EX lock. */
         if (rc == 0 && create_layout) {
@@@ -974,7 -1002,8 +1002,8 @@@ static void mdt_object_open_unlock(stru
         if (ibits == 0 || rc == -MDT_EREMOTE_OPEN)
                 RETURN_EXIT;
   
-       if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT)) {
+       if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT) &&
+           !(ibits & MDS_INODELOCK_DOM)) {
                 /* for the open request, the lock will only return to client
                  * if open or layout lock is granted. */
                 rc = 1;
@@@ -1111,6 -1140,12 +1140,12 @@@ out_unlock
                 mdt_object_open_unlock(info, o, lhc, ibits, rc);
   out:
         mdt_object_put(env, o);
+       if (rc == 0) {
+               rc = mdt_pack_size2body(info, rr->rr_fid2,
+                                       ibits & MDS_INODELOCK_DOM);
+               LASSERT(ergo(ibits & MDS_INODELOCK_DOM, !rc));
+               rc = 0;
+       }
   out_parent_put:
         if (parent != NULL)
                 mdt_object_put(env, parent);
@@@ -1284,12 -1319,17 +1319,12 @@@ int mdt_reint_open(struct mdt_thread_in
                 result = mdt_cross_open(info, rr->rr_fid2, rr->rr_fid1,
                                         ldlm_rep, create_flags);
                 GOTO(out, result);
- -      } else if (req_is_replay(req) ||
- -          (req->rq_export->exp_libclient && create_flags & MDS_OPEN_HAS_EA)) {
- -              /* This is a replay request or from liblustre with ea. */
+ +      } else if (req_is_replay(req)) {
                 result = mdt_open_by_fid(info, ldlm_rep);
   
- -              if (result != -ENOENT) {
- -                      if (req->rq_export->exp_libclient &&
- -                          create_flags & MDS_OPEN_HAS_EA)
- -                              GOTO(out, result = 0);
+ +              if (result != -ENOENT)
                         GOTO(out, result);
- -              }
+ +
                 /* We didn't find the correct object, so we need to re-create it
                  * via a regular replay. */
                 if (!(create_flags & MDS_OPEN_CREAT)) {
@@@ -1567,6 -1607,12 +1602,12 @@@ out_child_unlock
                 mdt_object_open_unlock(info, child, lhc, ibits, result);
   out_child:
         mdt_object_put(info->mti_env, child);
+       if (result == 0) {
+               rc = mdt_pack_size2body(info, child_fid,
+                                       ibits & MDS_INODELOCK_DOM);
+               LASSERT(ergo(ibits & MDS_INODELOCK_DOM, !rc));
+               rc = 0;
+       }
   out_parent:
         mdt_object_unlock_put(info, parent, lh, result || !created);
   out:
@@@ -1659,7 -1705,7 +1700,7 @@@ static inline int mdt_hsm_set_released(
         __u32   off;
         int     i;
   
- -      if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1_DEF)) {
+ +      if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1_DEFINED)) {
                 comp_v1 = (struct lov_comp_md_v1 *)lmm;
   
                 if (comp_v1->lcm_entry_count == 0)
@@@ -1764,17 -1810,17 +1805,17 @@@ static int mdt_hsm_release(struct mdt_t
         if (!(ma->ma_valid & MA_LOV)) {
                 /* Even empty file are released */
                 memset(ma->ma_lmm, 0, sizeof(*ma->ma_lmm));
- -              ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
+ +              ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEFINED);
                 ma->ma_lmm->lmm_pattern = cpu_to_le32(LOV_PATTERN_RAID0);
                 ma->ma_lmm->lmm_stripe_size = cpu_to_le32(LOV_MIN_STRIPE_SIZE);
                 ma->ma_lmm_size = sizeof(*ma->ma_lmm);
         } else {
- -              /* Magic must be LOV_MAGIC_*_DEF otherwise LOD will interpret
+ +              /* Magic must be LOV_MAGIC_*_DEFINED or LOD will interpret
                  * ma_lmm as lov_user_md, then it will be confused by union of
                  * layout_gen and stripe_offset. */
                 if ((le32_to_cpu(ma->ma_lmm->lmm_magic) & LOV_MAGIC_MASK) ==
                     LOV_MAGIC_MAGIC)
- -                      ma->ma_lmm->lmm_magic |= cpu_to_le32(LOV_MAGIC_DEF);
+ +                      ma->ma_lmm->lmm_magic |= cpu_to_le32(LOV_MAGIC_DEFINED);
                 else
                         GOTO(out_unlock, rc = -EINVAL);
         }
@@@ -2069,8 -2115,10 +2110,10 @@@ int mdt_mfd_close(struct mdt_thread_inf
         atomic_dec(&o->mot_open_count);
         mdt_handle_last_unlink(info, o, ma);
   
-         if (!MFD_CLOSED(mode))
-                 rc = mo_close(info->mti_env, next, ma, mode);
+       if (!MFD_CLOSED(mode)) {
+               rc = mo_close(info->mti_env, next, ma, mode);
+               mdt_dom_check_and_discard(info, o);
+       }
   
         /* adjust open and lease count */
         if (mode & MDS_OPEN_LEASE) {
diff --combined lustre/mdt/mdt_reint.c

index f72289e,1f079be..fde4c97
--- 1/lustre/mdt/mdt_reint.c
--- 2/lustre/mdt/mdt_reint.c
+++ b/lustre/mdt/mdt_reint.c
@@@ -660,7 -660,7 +660,7 @@@ static int mdt_attr_set(struct mdt_thre
   
           if (rc != 0)
                   GOTO(out_unlock, rc);
- 
+       mdt_dom_obj_lvb_update(info->mti_env, mo, false);
           EXIT;
   out_unlock:
         mdt_unlock_slaves(info, mo, lockpart, s0_lh, s0_obj, einfo, rc);
@@@ -795,11 -795,11 +795,11 @@@ static int mdt_reint_setattr(struct mdt
   
         mdt_pack_attr2body(info, repbody, &ma->ma_attr, mdt_object_fid(mo));
   
-         EXIT;
+       EXIT;
   out_put:
-         mdt_object_put(info->mti_env, mo);
+       mdt_object_put(info->mti_env, mo);
   out:
-         if (rc == 0)
+       if (rc == 0)
                 mdt_counter_incr(req, LPROC_MDT_SETATTR);
   
           mdt_client_compatibility(info);
@@@ -873,6 -873,7 +873,7 @@@ static int mdt_reint_unlink(struct mdt_
         bool cos_incompat = false;
         int no_name = 0;
         int rc;
+ 
         ENTRY;
   
         DEBUG_REQ(D_INODE, req, "unlink "DFID"/"DNAME"", PFID(rr->rr_fid1),
@@@ -1044,32 -1045,39 +1045,39 @@@ relock
                         mdt_object_child(mc), &rr->rr_name, ma, no_name);
   
         mutex_unlock(&mc->mot_lov_mutex);
+       if (rc != 0)
+               GOTO(unlock_child, rc);
   
-       if (rc == 0 && !lu_object_is_dying(&mc->mot_header))
+       if (!lu_object_is_dying(&mc->mot_header)) {
                 rc = mdt_attr_get_complex(info, mc, ma);
-       if (rc == 0)
-               mdt_handle_last_unlink(info, mc, ma);
+               if (rc)
+                       GOTO(out_stat, rc);
+       } else {
+               mdt_dom_check_and_discard(info, mc);
+       }
+       mdt_handle_last_unlink(info, mc, ma);
   
-         if (ma->ma_valid & MA_INODE) {
-                 switch (ma->ma_attr.la_mode & S_IFMT) {
-                 case S_IFDIR:
+ out_stat:
+       if (ma->ma_valid & MA_INODE) {
+               switch (ma->ma_attr.la_mode & S_IFMT) {
+               case S_IFDIR:
                         mdt_counter_incr(req, LPROC_MDT_RMDIR);
-                         break;
-                 case S_IFREG:
-                 case S_IFLNK:
-                 case S_IFCHR:
-                 case S_IFBLK:
-                 case S_IFIFO:
-                 case S_IFSOCK:
+                       break;
+               case S_IFREG:
+               case S_IFLNK:
+               case S_IFCHR:
+               case S_IFBLK:
+               case S_IFIFO:
+               case S_IFSOCK:
                         mdt_counter_incr(req, LPROC_MDT_UNLINK);
-                         break;
-                 default:
-                         LASSERTF(0, "bad file type %o unlinking\n",
-                                  ma->ma_attr.la_mode);
-                 }
-         }
+                       break;
+               default:
+                       LASSERTF(0, "bad file type %o unlinking\n",
+                               ma->ma_attr.la_mode);
+               }
+       }
   
-         EXIT;
+       EXIT;
   
   unlock_child:
         mdt_unlock_slaves(info, mc, MDS_INODELOCK_UPDATE, s0_lh, s0_obj, einfo,
@@@ -1452,7 -1460,6 +1460,7 @@@ again
                                 GOTO(out, rc = -EBUSY);
                         }
   
+ +                      mdt_lock_pdo_init(&mll->mll_lh, LCK_PW, &name);
                         rc = mdt_object_lock(info, mdt_pobj, &mll->mll_lh,
                                              MDS_INODELOCK_UPDATE);
                         if (rc != 0) {
@@@ -2107,8 -2114,10 +2115,10 @@@ relock
         /* handle last link of tgt object */
         if (rc == 0) {
                 mdt_counter_incr(req, LPROC_MDT_RENAME);
-               if (mnew)
+               if (mnew) {
                         mdt_handle_last_unlink(info, mnew, ma);
+                       mdt_dom_check_and_discard(info, mnew);
+               }
   
                 mdt_rename_counter_tally(info, info->mti_mdt, req,
                                          msrcdir, mtgtdir);
diff --combined lustre/obdclass/genops.c

index 8f2c64b,6799c0c..8a88c6d
--- 1/lustre/obdclass/genops.c
--- 2/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@@ -40,7 -40,6 +40,7 @@@
   #include <linux/pid_namespace.h>
   #include <linux/kthread.h>
   #include <obd_class.h>
+ +#include <lustre_log.h>
   #include <lprocfs_status.h>
   #include <lustre_disk.h>
   #include <lustre_kernelcomm.h>
@@@ -163,19 -162,6 +163,19 @@@ void class_put_type(struct obd_type *ty
         spin_unlock(&type->obd_type_lock);
   }
   
+ +/*
+ + * kobject release callback for the typ_kobj embedded in struct obd_type.
+ + *
+ + * Signals typ_kobj_unregister; class_unregister_type() waits on that
+ + * completion right after its kobject_put().
+ + */
+ +static void class_sysfs_release(struct kobject *kobj)
+ +{
+ +      struct obd_type *owner;
+ +
+ +      owner = container_of(kobj, struct obd_type, typ_kobj);
+ +      complete(&owner->typ_kobj_unregister);
+ +}
+ +
+ +/* kobj_type passed to kobject_init_and_add() for an obd_type's typ_kobj. */
+ +static struct kobj_type class_ktype = {
+ +      .sysfs_ops      = &lustre_sysfs_ops,
+ +      .release        = class_sysfs_release,
+ +};
+ +
   #define CLASS_MAX_NAME 1024
   
   int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
@@@ -227,29 -213,28 +227,29 @@@
                 }
         }
   #endif
- -      type->typ_kobj = kobject_create_and_add(type->typ_name, lustre_kobj);
- -      if (!type->typ_kobj) {
- -              rc = -ENOMEM;
+ +      type->typ_kobj.kset = lustre_kset;
+ +      init_completion(&type->typ_kobj_unregister);
+ +      rc = kobject_init_and_add(&type->typ_kobj, &class_ktype,
+ +                                &lustre_kset->kobj, "%s", type->typ_name);
+ +      if (rc)
                 GOTO(failed, rc);
- -      }
   
- -        if (ldt != NULL) {
- -                type->typ_lu = ldt;
- -                rc = lu_device_type_init(ldt);
- -                if (rc != 0)
- -                        GOTO (failed, rc);
- -        }
+ +      if (ldt) {
+ +              type->typ_lu = ldt;
+ +              rc = lu_device_type_init(ldt);
+ +              if (rc) {
+ +                      kobject_put(&type->typ_kobj);
+ +                      GOTO(failed, rc);
+ +              }
+ +      }
   
         spin_lock(&obd_types_lock);
         list_add(&type->typ_chain, &obd_types);
         spin_unlock(&obd_types_lock);
   
- -        RETURN (0);
+ +      RETURN(0);
   
   failed:
- -      if (type->typ_kobj)
- -              kobject_put(type->typ_kobj);
         if (type->typ_name != NULL) {
   #ifdef CONFIG_PROC_FS
                 if (type->typ_procroot != NULL)
@@@ -285,8 -270,8 +285,8 @@@ int class_unregister_type(const char *n
                   RETURN(-EBUSY);
           }
   
- -      if (type->typ_kobj)
- -              kobject_put(type->typ_kobj);
+ +      kobject_put(&type->typ_kobj);
+ +      wait_for_completion(&type->typ_kobj_unregister);
   
         /* we do not use type->typ_procroot as for compatibility purposes
          * other modules can share names (i.e. lod can use lov entry). so
@@@ -317,20 -302,21 +317,20 @@@ EXPORT_SYMBOL(class_unregister_type)
   /**
    * Create a new obd device.
    *
- - * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ + * Allocate the new obd_device and initialize it.
    *
    * \param[in] type_name obd device type string.
    * \param[in] name      obd device name.
+ + * \param[in] uuid      obd device UUID
    *
- - * \retval NULL if create fails, otherwise return the obd device
- - *         pointer created.
+ + * \retval newdev         pointer to created obd_device
+ + * \retval ERR_PTR(errno) on error
    */
- -struct obd_device *class_newdev(const char *type_name, const char *name)
+ +struct obd_device *class_newdev(const char *type_name, const char *name,
+ +                              const char *uuid)
   {
- -        struct obd_device *result = NULL;
           struct obd_device *newdev;
           struct obd_type *type = NULL;
- -        int i;
- -        int new_obd_minor = 0;
           ENTRY;
   
           if (strlen(name) >= MAX_OBD_NAME) {
@@@ -345,184 -331,93 +345,184 @@@
           }
   
           newdev = obd_device_alloc();
- -      if (newdev == NULL)
- -              GOTO(out_type, result = ERR_PTR(-ENOMEM));
- -
+ +      if (newdev == NULL) {
+ +              class_put_type(type);
+ +              RETURN(ERR_PTR(-ENOMEM));
+ +      }
           LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+ +      strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1);
+ +      newdev->obd_type = type;
+ +      newdev->obd_minor = -1;
+ +
+ +      rwlock_init(&newdev->obd_pool_lock);
+ +      newdev->obd_pool_limit = 0;
+ +      newdev->obd_pool_slv = 0;
+ +
+ +      INIT_LIST_HEAD(&newdev->obd_exports);
+ +      INIT_LIST_HEAD(&newdev->obd_unlinked_exports);
+ +      INIT_LIST_HEAD(&newdev->obd_delayed_exports);
+ +      INIT_LIST_HEAD(&newdev->obd_exports_timed);
+ +      INIT_LIST_HEAD(&newdev->obd_nid_stats);
+ +      spin_lock_init(&newdev->obd_nid_lock);
+ +      spin_lock_init(&newdev->obd_dev_lock);
+ +      mutex_init(&newdev->obd_dev_mutex);
+ +      spin_lock_init(&newdev->obd_osfs_lock);
+ +      /* newdev->obd_osfs_age must be set to a value in the distant
+ +       * past to guarantee a fresh statfs is fetched on mount. */
+ +      newdev->obd_osfs_age = cfs_time_shift_64(-1000);
+ +
+ +      /* XXX belongs in setup not attach  */
+ +      init_rwsem(&newdev->obd_observer_link_sem);
+ +      /* recovery data */
+ +      init_timer(&newdev->obd_recovery_timer);
+ +      spin_lock_init(&newdev->obd_recovery_task_lock);
+ +      init_waitqueue_head(&newdev->obd_next_transno_waitq);
+ +      init_waitqueue_head(&newdev->obd_evict_inprogress_waitq);
+ +      INIT_LIST_HEAD(&newdev->obd_req_replay_queue);
+ +      INIT_LIST_HEAD(&newdev->obd_lock_replay_queue);
+ +      INIT_LIST_HEAD(&newdev->obd_final_req_queue);
+ +      INIT_LIST_HEAD(&newdev->obd_evict_list);
+ +      INIT_LIST_HEAD(&newdev->obd_lwp_list);
+ +
+ +      llog_group_init(&newdev->obd_olg);
+ +      /* Detach drops this */
+ +      atomic_set(&newdev->obd_refcount, 1);
+ +      lu_ref_init(&newdev->obd_reference);
+ +      lu_ref_add(&newdev->obd_reference, "newdev", newdev);
+ +
+ +      newdev->obd_conn_inprogress = 0;
+ +
+ +      strncpy(newdev->obd_uuid.uuid, uuid, strlen(uuid));
+ +
+ +      CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n",
+ +             newdev->obd_name, newdev);
+ +
+ +      return newdev;
+ +}
   
- -      write_lock(&obd_dev_lock);
- -        for (i = 0; i < class_devno_max(); i++) {
- -                struct obd_device *obd = class_num2obd(i);
- -
- -              if (obd && (strcmp(name, obd->obd_name) == 0)) {
- -                        CERROR("Device %s already exists at %d, won't add\n",
- -                               name, i);
- -                        if (result) {
- -                                LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
- -                                         "%p obd_magic %08x != %08x\n", result,
- -                                         result->obd_magic, OBD_DEVICE_MAGIC);
- -                                LASSERTF(result->obd_minor == new_obd_minor,
- -                                         "%p obd_minor %d != %d\n", result,
- -                                         result->obd_minor, new_obd_minor);
- -
- -                                obd_devs[result->obd_minor] = NULL;
- -                                result->obd_name[0]='\0';
- -                         }
- -                        result = ERR_PTR(-EEXIST);
- -                        break;
- -                }
- -                if (!result && !obd) {
- -                        result = newdev;
- -                        result->obd_minor = i;
- -                        new_obd_minor = i;
- -                        result->obd_type = type;
- -                        strncpy(result->obd_name, name,
- -                                sizeof(result->obd_name) - 1);
- -                        obd_devs[i] = result;
- -                }
- -        }
- -      write_unlock(&obd_dev_lock);
- -
- -        if (result == NULL && i >= class_devno_max()) {
- -                CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
- -                       class_devno_max());
- -              GOTO(out, result = ERR_PTR(-EOVERFLOW));
- -        }
- -
- -      if (IS_ERR(result))
- -              GOTO(out, result);
+ +/**
+ + * Free obd device.
+ + *
+ + * \param[in] obd obd_device to be freed
+ + *
+ + * \retval none
+ + */
+ +void class_free_dev(struct obd_device *obd)
+ +{
+ +      struct obd_type *obd_type = obd->obd_type;
+ +
+ +      LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x "
+ +               "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+ +      LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd,
+ +               "obd %p != obd_devs[%d] %p\n",
+ +               obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+ +      LASSERTF(atomic_read(&obd->obd_refcount) == 0,
+ +               "obd_refcount should be 0, not %d\n",
+ +               atomic_read(&obd->obd_refcount));
+ +      LASSERT(obd_type != NULL);
+ +
+ +      CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n",
+ +             obd->obd_name, obd->obd_type->typ_name);
+ +
+ +      CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+ +                       obd->obd_name, obd->obd_uuid.uuid);
+ +      if (obd->obd_stopping) {
+ +              int err;
+ +
+ +              /* If we're not stopping, we were never set up */
+ +              err = obd_cleanup(obd);
+ +              if (err)
+ +                      CERROR("Cleanup %s returned %d\n",
+ +                              obd->obd_name, err);
+ +      }
   
- -      CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
- -             result->obd_name, result);
+ +      obd_device_free(obd);
   
- -      RETURN(result);
- -out:
- -      obd_device_free(newdev);
- -out_type:
- -      class_put_type(type);
- -      return result;
+ +      class_put_type(obd_type);
   }
   
- -void class_release_dev(struct obd_device *obd)
+ +/**
+ + * Unregister obd device.
+ + *
+ + * Free slot in obd_devs[] used by \a obd.
+ + *
+ + * \param[in] obd obd_device to be unregistered
+ + *
+ + * \retval none
+ + */
+ +void class_unregister_device(struct obd_device *obd)
   {
- -        struct obd_type *obd_type = obd->obd_type;
- -
- -        LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n",
- -                 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
- -        LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n",
- -                 obd, obd->obd_minor, obd_devs[obd->obd_minor]);
- -        LASSERT(obd_type != NULL);
+ +      write_lock(&obd_dev_lock);
+ +      if (obd->obd_minor >= 0) {
+ +              LASSERT(obd_devs[obd->obd_minor] == obd);
+ +              obd_devs[obd->obd_minor] = NULL;
+ +              obd->obd_minor = -1;
+ +      }
+ +      write_unlock(&obd_dev_lock);
+ +}
   
- -        CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
- -               obd->obd_name, obd->obd_minor, obd->obd_type->typ_name);
+ +/**
+ + * Register obd device.
+ + *
+ + * Find a free slot in obd_devs[] and fill it with \a new_obd.
+ + *
+ + * \param[in] new_obd obd_device to be registered
+ + *
+ + * \retval 0          success
+ + * \retval -EEXIST    device with this name is registered
+ + * \retval -EOVERFLOW obd_devs[] is full
+ + */
+ +int class_register_device(struct obd_device *new_obd)
+ +{
+ +      int ret = 0;
+ +      int i;
+ +      int new_obd_minor = 0;
+ +      bool minor_assign = false;
   
         write_lock(&obd_dev_lock);
- -        obd_devs[obd->obd_minor] = NULL;
+ +      for (i = 0; i < class_devno_max(); i++) {
+ +              struct obd_device *obd = class_num2obd(i);
+ +
+ +              if (obd != NULL &&
+ +                  (strcmp(new_obd->obd_name, obd->obd_name) == 0)) {
+ +                      CERROR("%s: already exists, won't add\n",
+ +                             obd->obd_name);
+ +                      /* in case we found a free slot before duplicate */
+ +                      minor_assign = false;
+ +                      ret = -EEXIST;
+ +                      break;
+ +              }
+ +              if (!minor_assign && obd == NULL) {
+ +                      new_obd_minor = i;
+ +                      minor_assign = true;
+ +              }
+ +      }
+ +
+ +      if (minor_assign) {
+ +              new_obd->obd_minor = new_obd_minor;
+ +              LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] "
+ +                       "%p\n", new_obd_minor, obd_devs[new_obd_minor]);
+ +              obd_devs[new_obd_minor] = new_obd;
+ +      } else {
+ +              if (ret == 0) {
+ +                      ret = -EOVERFLOW;
+ +                      CERROR("%s: all %u/%u devices used, increase "
+ +                             "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name,
+ +                             i, class_devno_max(), ret);
+ +              }
+ +      }
         write_unlock(&obd_dev_lock);
- -        obd_device_free(obd);
   
- -        class_put_type(obd_type);
+ +      RETURN(ret);
   }
   
- -int class_name2dev(const char *name)
+ +static int class_name2dev_nolock(const char *name)
   {
           int i;
   
           if (!name)
                   return -1;
   
- -      read_lock(&obd_dev_lock);
           for (i = 0; i < class_devno_max(); i++) {
                   struct obd_device *obd = class_num2obd(i);
   
@@@ -531,30 -426,17 +531,30 @@@
                              out any references */
                           LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
                           if (obd->obd_attached) {
- -                              read_unlock(&obd_dev_lock);
                                   return i;
                           }
                           break;
                   }
           }
- -      read_unlock(&obd_dev_lock);
   
           return -1;
   }
   
+ +int class_name2dev(const char *name)
+ +{
+ +      int i;
+ +
+ +      if (!name)
+ +              return -1;
+ +
+ +      read_lock(&obd_dev_lock);
+ +      i = class_name2dev_nolock(name);
+ +      read_unlock(&obd_dev_lock);
+ +
+ +      return i;
+ +}
+ +EXPORT_SYMBOL(class_name2dev);
+ +
   struct obd_device *class_name2obd(const char *name)
   {
           int dev = class_name2dev(name);
@@@ -565,34 -447,25 +565,34 @@@
   }
   EXPORT_SYMBOL(class_name2obd);
   
- -int class_uuid2dev(struct obd_uuid *uuid)
+ +int class_uuid2dev_nolock(struct obd_uuid *uuid)
   {
           int i;
   
- -      read_lock(&obd_dev_lock);
           for (i = 0; i < class_devno_max(); i++) {
                   struct obd_device *obd = class_num2obd(i);
   
                   if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) {
                           LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
                           return i;
                   }
           }
- -      read_unlock(&obd_dev_lock);
   
           return -1;
   }
   
+ +int class_uuid2dev(struct obd_uuid *uuid)
+ +{
+ +      int i;
+ +
+ +      read_lock(&obd_dev_lock);
+ +      i = class_uuid2dev_nolock(uuid);
+ +      read_unlock(&obd_dev_lock);
+ +
+ +      return i;
+ +}
+ +EXPORT_SYMBOL(class_uuid2dev);
+ +
   struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
   {
           int dev = class_uuid2dev(uuid);
@@@ -631,40 -504,6 +631,40 @@@ struct obd_device *class_num2obd(int nu
   }
   
   /**
+ + * Find obd in obd_devs[] by name or uuid.
+ + *
+ + * Increment obd's refcount if found.
+ + *
+ + * \param[in] str obd name or uuid
+ + *
+ + * \retval NULL    if not found
+ + * \retval target  pointer to found obd_device
+ + */
+ +struct obd_device *class_dev_by_str(const char *str)
+ +{
+ +      struct obd_device *target = NULL;
+ +      struct obd_uuid tgtuuid;
+ +      int rc;
+ +
+ +      obd_str2uuid(&tgtuuid, str);
+ +
+ +      read_lock(&obd_dev_lock);
+ +      rc = class_uuid2dev_nolock(&tgtuuid);
+ +      if (rc < 0)
+ +              rc = class_name2dev_nolock(str);
+ +
+ +      if (rc >= 0)
+ +              target = class_num2obd(rc);
+ +
+ +      if (target != NULL)
+ +              class_incref(target, "find", current);
+ +      read_unlock(&obd_dev_lock);
+ +
+ +      RETURN(target);
+ +}
+ +EXPORT_SYMBOL(class_dev_by_str);
+ +
+ +/**
    * Get obd devices count. Device in any
    *    state are counted
    * \retval obd device count
@@@ -956,10 -795,7 +956,10 @@@ static void class_export_destroy(struc
         LASSERT(list_empty(&exp->exp_req_replay_queue));
         LASSERT(list_empty(&exp->exp_hp_rpcs));
           obd_destroy_export(exp);
- -        class_decref(obd, "export", exp);
+ +      /* self export doesn't hold a reference to an obd, although it
+ +       * exists until freeing of the obd */
+ +      if (exp != obd->obd_self_export)
+ +              class_decref(obd, "export", exp);
   
           OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
           EXIT;
@@@ -992,37 -828,24 +992,37 @@@ void class_export_put(struct obd_expor
                atomic_read(&exp->exp_refcount) - 1);
   
         if (atomic_dec_and_test(&exp->exp_refcount)) {
- -              LASSERT(!list_empty(&exp->exp_obd_chain));
- -              LASSERT(list_empty(&exp->exp_stale_list));
+ +              struct obd_device *obd = exp->exp_obd;
+ +
                 CDEBUG(D_IOCTL, "final put %p/%s\n",
                        exp, exp->exp_client_uuid.uuid);
   
                 /* release nid stat refererence */
                 lprocfs_exp_cleanup(exp);
   
- -              obd_zombie_export_add(exp);
+ +              if (exp == obd->obd_self_export) {
+ +                      /* self export should be destroyed without
+ +                       * zombie thread as it doesn't hold a
+ +                       * reference to obd and doesn't hold any
+ +                       * resources */
+ +                      class_export_destroy(exp);
+ +                      /* self export is destroyed, no class
+ +                       * references exist and it is safe to free
+ +                       * obd */
+ +                      class_free_dev(obd);
+ +              } else {
+ +                      LASSERT(!list_empty(&exp->exp_obd_chain));
+ +                      obd_zombie_export_add(exp);
+ +              }
+ +
         }
   }
   EXPORT_SYMBOL(class_export_put);
   /* Creates a new export, adds it to the hash table, and returns a
    * pointer to it. The refcount is 2: one for the hash reference, and
    * one for the pointer returned by this function. */
- -struct obd_export *class_new_export(struct obd_device *obd,
- -                                    struct obd_uuid *cluuid)
+ +struct obd_export *__class_new_export(struct obd_device *obd,
+ +                                    struct obd_uuid *cluuid, bool is_self)
   {
           struct obd_export *export;
         struct cfs_hash *hash = NULL;
@@@ -1036,7 -859,6 +1036,7 @@@
           export->exp_conn_cnt = 0;
           export->exp_lock_hash = NULL;
         export->exp_flock_hash = NULL;
+ +      /* 2 = class_handle_hash + last */
         atomic_set(&export->exp_refcount, 2);
         atomic_set(&export->exp_rpc_count, 0);
         atomic_set(&export->exp_cb_count, 0);
@@@ -1055,7 -877,7 +1055,7 @@@
         INIT_LIST_HEAD(&export->exp_hp_rpcs);
         INIT_LIST_HEAD(&export->exp_reg_rpcs);
         class_handle_hash(&export->exp_handle, &export_handle_ops);
- -      export->exp_last_request_time = cfs_time_current_sec();
+ +      export->exp_last_request_time = ktime_get_real_seconds();
         spin_lock_init(&export->exp_lock);
         spin_lock_init(&export->exp_rpc_lock);
         INIT_HLIST_NODE(&export->exp_uuid_hash);
@@@ -1070,17 -892,17 +1070,17 @@@
         export->exp_client_uuid = *cluuid;
         obd_init_export(export);
   
- -      spin_lock(&obd->obd_dev_lock);
- -      /* shouldn't happen, but might race */
- -      if (obd->obd_stopping)
- -              GOTO(exit_unlock, rc = -ENODEV);
+ +      if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+ +              spin_lock(&obd->obd_dev_lock);
+ +              /* shouldn't happen, but might race */
+ +              if (obd->obd_stopping)
+ +                      GOTO(exit_unlock, rc = -ENODEV);
   
- -      hash = cfs_hash_getref(obd->obd_uuid_hash);
- -      if (hash == NULL)
- -              GOTO(exit_unlock, rc = -ENODEV);
- -      spin_unlock(&obd->obd_dev_lock);
+ +              hash = cfs_hash_getref(obd->obd_uuid_hash);
+ +              if (hash == NULL)
+ +                      GOTO(exit_unlock, rc = -ENODEV);
+ +              spin_unlock(&obd->obd_dev_lock);
   
- -        if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
                   rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
                   if (rc != 0) {
                           LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
@@@ -1092,24 -914,17 +1092,24 @@@
         at_init(&export->exp_bl_lock_at, obd_timeout, 0);
         spin_lock(&obd->obd_dev_lock);
           if (obd->obd_stopping) {
- -                cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
- -                GOTO(exit_unlock, rc = -ENODEV);
+ +              if (hash)
+ +                      cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+ +              GOTO(exit_unlock, rc = -ESHUTDOWN);
           }
   
- -        class_incref(obd, "export", export);
- -      list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
- -      list_add_tail(&export->exp_obd_chain_timed,
- -                    &export->exp_obd->obd_exports_timed);
- -        export->exp_obd->obd_num_exports++;
+ +      if (!is_self) {
+ +              class_incref(obd, "export", export);
+ +              list_add_tail(&export->exp_obd_chain_timed,
+ +                            &obd->obd_exports_timed);
+ +              list_add(&export->exp_obd_chain, &obd->obd_exports);
+ +              obd->obd_num_exports++;
+ +      } else {
+ +              INIT_LIST_HEAD(&export->exp_obd_chain_timed);
+ +              INIT_LIST_HEAD(&export->exp_obd_chain);
+ +      }
         spin_unlock(&obd->obd_dev_lock);
- -      cfs_hash_putref(hash);
+ +      if (hash)
+ +              cfs_hash_putref(hash);
         RETURN(export);
   
   exit_unlock:
@@@ -1123,29 -938,12 +1123,29 @@@ exit_err
           OBD_FREE_PTR(export);
           return ERR_PTR(rc);
   }
+ +
+ +struct obd_export *class_new_export(struct obd_device *obd,
+ +                                  struct obd_uuid *uuid)
+ +{
+ +      return __class_new_export(obd, uuid, false);
+ +}
   EXPORT_SYMBOL(class_new_export);
   
+ +struct obd_export *class_new_export_self(struct obd_device *obd,
+ +                                       struct obd_uuid *uuid)
+ +{
+ +      return __class_new_export(obd, uuid, true);
+ +}
+ +
   void class_unlink_export(struct obd_export *exp)
   {
         class_handle_unhash(&exp->exp_handle);
   
+ +      if (exp->exp_obd->obd_self_export == exp) {
+ +              class_export_put(exp);
+ +              return;
+ +      }
+ +
         spin_lock(&exp->exp_obd->obd_dev_lock);
         /* delete an uuid-export hashitem from hashtables */
         if (!hlist_unhashed(&exp->exp_uuid_hash))
@@@ -1528,7 -1326,7 +1528,7 @@@ static void class_disconnect_export_lis
   
                   class_export_get(exp);
                   CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
- -                     "last request at %ld\n",
+ +                     "last request at %lld\n",
                          exp->exp_obd->obd_name, obd_export_nid2str(exp),
                          exp, exp->exp_last_request_time);
                   /* release one export reference anyway */
@@@ -2188,14 -1986,14 +2188,14 @@@ int obd_get_request_slot(struct client_
         int                              rc;
   
         spin_lock(&cli->cl_loi_list_lock);
-       if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) {
-               cli->cl_r_in_flight++;
+       if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) {
+               cli->cl_rpcs_in_flight++;
                 spin_unlock(&cli->cl_loi_list_lock);
                 return 0;
         }
   
         init_waitqueue_head(&orsw.orsw_waitq);
-       list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list);
+       list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters);
         orsw.orsw_signaled = false;
         spin_unlock(&cli->cl_loi_list_lock);
   
@@@ -2211,7 -2009,7 +2211,7 @@@
         if (rc != 0) {
                 if (!orsw.orsw_signaled) {
                         if (list_empty(&orsw.orsw_entry))
-                               cli->cl_r_in_flight--;
+                               cli->cl_rpcs_in_flight--;
                         else
                                 list_del(&orsw.orsw_entry);
                 }
@@@ -2233,15 -2031,15 +2233,15 @@@ void obd_put_request_slot(struct client
         struct obd_request_slot_waiter *orsw;
   
         spin_lock(&cli->cl_loi_list_lock);
-       cli->cl_r_in_flight--;
+       cli->cl_rpcs_in_flight--;
   
         /* If there is free slot, wakeup the first waiter. */
-       if (!list_empty(&cli->cl_loi_read_list) &&
-           likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) {
-               orsw = list_entry(cli->cl_loi_read_list.next,
+       if (!list_empty(&cli->cl_flight_waiters) &&
+           likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) {
+               orsw = list_entry(cli->cl_flight_waiters.next,
                                   struct obd_request_slot_waiter, orsw_entry);
                 list_del_init(&orsw->orsw_entry);
-               cli->cl_r_in_flight++;
+               cli->cl_rpcs_in_flight++;
                 wake_up(&orsw->orsw_waitq);
         }
         spin_unlock(&cli->cl_loi_list_lock);
@@@ -2287,17 -2085,19 +2287,19 @@@ int obd_set_max_rpcs_in_flight(struct c
         spin_lock(&cli->cl_loi_list_lock);
         old = cli->cl_max_rpcs_in_flight;
         cli->cl_max_rpcs_in_flight = max;
+       client_adjust_max_dirty(cli);
+ 
         diff = max - old;
   
         /* We increase the max_rpcs_in_flight, then wakeup some waiters. */
         for (i = 0; i < diff; i++) {
-               if (list_empty(&cli->cl_loi_read_list))
+               if (list_empty(&cli->cl_flight_waiters))
                         break;
   
-               orsw = list_entry(cli->cl_loi_read_list.next,
+               orsw = list_entry(cli->cl_flight_waiters.next,
                                   struct obd_request_slot_waiter, orsw_entry);
                 list_del_init(&orsw->orsw_entry);
-               cli->cl_r_in_flight++;
+               cli->cl_rpcs_in_flight++;
                 wake_up(&orsw->orsw_waitq);
         }
         spin_unlock(&cli->cl_loi_list_lock);
diff --combined lustre/obdclass/obd_config.c

index ea7f631,84987d3..056ef94
--- 1/lustre/obdclass/obd_config.c
--- 2/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@@ -365,7 -365,6 +365,7 @@@ EXPORT_SYMBOL(lustre_cfg_string)
    */
   int class_attach(struct lustre_cfg *lcfg)
   {
+ +      struct obd_export *exp;
           struct obd_device *obd = NULL;
           char *typename, *name, *uuid;
           int rc, len;
@@@ -382,26 -381,24 +382,26 @@@
                   RETURN(-EINVAL);
           }
           name = lustre_cfg_string(lcfg, 0);
- -
           if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
                   CERROR("No UUID passed!\n");
                   RETURN(-EINVAL);
           }
- -        uuid = lustre_cfg_string(lcfg, 2);
   
- -        CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
- -               MKSTR(typename), MKSTR(name), MKSTR(uuid));
+ +      uuid = lustre_cfg_string(lcfg, 2);
+ +      len = strlen(uuid);
+ +      if (len >= sizeof(obd->obd_uuid)) {
+ +              CERROR("%s: uuid must be < %d bytes long\n",
+ +                     name, (int)sizeof(obd->obd_uuid));
+ +              RETURN(-EINVAL);
+ +      }
   
- -        obd = class_newdev(typename, name);
+ +      obd = class_newdev(typename, name, uuid);
           if (IS_ERR(obd)) {
                   /* Already exists or out of obds */
                   rc = PTR_ERR(obd);
- -                obd = NULL;
                   CERROR("Cannot create device %s of type %s : %d\n",
                          name, typename, rc);
- -                GOTO(out, rc);
+ +              RETURN(rc);
           }
           LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
                    name, typename);
@@@ -411,30 -408,64 +411,30 @@@
           LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
                    "%p obd_name %s != %s\n", obd, obd->obd_name, name);
   
- -      rwlock_init(&obd->obd_pool_lock);
- -      obd->obd_pool_limit = 0;
- -      obd->obd_pool_slv = 0;
- -
- -      INIT_LIST_HEAD(&obd->obd_exports);
- -      INIT_LIST_HEAD(&obd->obd_unlinked_exports);
- -      INIT_LIST_HEAD(&obd->obd_delayed_exports);
- -      INIT_LIST_HEAD(&obd->obd_exports_timed);
- -      INIT_LIST_HEAD(&obd->obd_nid_stats);
- -      spin_lock_init(&obd->obd_nid_lock);
- -      spin_lock_init(&obd->obd_dev_lock);
- -      mutex_init(&obd->obd_dev_mutex);
- -      spin_lock_init(&obd->obd_osfs_lock);
- -      /* obd->obd_osfs_age must be set to a value in the distant
- -       * past to guarantee a fresh statfs is fetched on mount. */
- -      obd->obd_osfs_age = cfs_time_shift_64(-1000);
- -
- -      /* XXX belongs in setup not attach  */
- -      init_rwsem(&obd->obd_observer_link_sem);
- -      /* recovery data */
- -      init_timer(&obd->obd_recovery_timer);
- -      spin_lock_init(&obd->obd_recovery_task_lock);
- -      init_waitqueue_head(&obd->obd_next_transno_waitq);
- -      init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
- -      INIT_LIST_HEAD(&obd->obd_req_replay_queue);
- -      INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
- -      INIT_LIST_HEAD(&obd->obd_final_req_queue);
- -      INIT_LIST_HEAD(&obd->obd_evict_list);
- -      INIT_LIST_HEAD(&obd->obd_lwp_list);
- -
- -      llog_group_init(&obd->obd_olg);
- -
- -      obd->obd_conn_inprogress = 0;
- -
- -        len = strlen(uuid);
- -        if (len >= sizeof(obd->obd_uuid)) {
- -                CERROR("uuid must be < %d bytes long\n",
- -                       (int)sizeof(obd->obd_uuid));
- -                GOTO(out, rc = -EINVAL);
- -        }
- -        memcpy(obd->obd_uuid.uuid, uuid, len);
+ +      exp = class_new_export_self(obd, &obd->obd_uuid);
+ +      if (IS_ERR(exp)) {
+ +              /* force free */
+ +              GOTO(out, rc = PTR_ERR(exp));
+ +              RETURN(PTR_ERR(exp));
+ +      }
   
- -        /* Detach drops this */
- -      spin_lock(&obd->obd_dev_lock);
- -      atomic_set(&obd->obd_refcount, 1);
- -      spin_unlock(&obd->obd_dev_lock);
- -        lu_ref_init(&obd->obd_reference);
- -        lu_ref_add(&obd->obd_reference, "attach", obd);
+ +      obd->obd_self_export = exp;
+ +      list_del_init(&exp->exp_obd_chain_timed);
+ +      class_export_put(exp);
+ +
+ +      rc = class_register_device(obd);
+ +      if (rc != 0)
+ +              GOTO(out, rc);
   
- -        obd->obd_attached = 1;
- -        CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+ +      obd->obd_attached = 1;
+ +      CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
                obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
- -        RETURN(0);
- - out:
- -        if (obd != NULL) {
- -                class_release_dev(obd);
- -        }
- -        return rc;
+ +      RETURN(0);
+ +out:
+ +      class_decref(obd, "newdev", obd);
+ +      class_free_dev(obd);
+ +
+ +      RETURN(rc);
   }
   EXPORT_SYMBOL(class_attach);
   
@@@ -444,6 -475,7 +444,6 @@@
   int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
   {
           int err = 0;
- -        struct obd_export *exp;
           ENTRY;
   
           LASSERT(obd != NULL);
@@@ -492,7 -524,7 +492,7 @@@
                                                CFS_HASH_MAX_THETA,
                                                &uuid_hash_ops, CFS_HASH_DEFAULT);
           if (!obd->obd_uuid_hash)
- -                GOTO(err_hash, err = -ENOMEM);
+ +              GOTO(err_exit, err = -ENOMEM);
   
           /* create a nid-export lustre hash */
           obd->obd_nid_hash = cfs_hash_create("NID_HASH",
@@@ -503,7 -535,7 +503,7 @@@
                                               CFS_HASH_MAX_THETA,
                                               &nid_hash_ops, CFS_HASH_DEFAULT);
           if (!obd->obd_nid_hash)
- -                GOTO(err_hash, err = -ENOMEM);
+ +              GOTO(err_exit, err = -ENOMEM);
   
           /* create a nid-stats lustre hash */
           obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
@@@ -513,8 -545,8 +513,8 @@@
                                                     CFS_HASH_MIN_THETA,
                                                     CFS_HASH_MAX_THETA,
                                                     &nid_stat_hash_ops, CFS_HASH_DEFAULT);
- -        if (!obd->obd_nid_stats_hash)
- -                GOTO(err_hash, err = -ENOMEM);
+ +      if (!obd->obd_nid_stats_hash)
+ +              GOTO(err_exit, err = -ENOMEM);
   
         /* create a client_generation-export lustre hash */
         obd->obd_gen_hash = cfs_hash_create("UUID_HASH",
@@@ -525,13 -557,21 +525,13 @@@
                                             CFS_HASH_MAX_THETA,
                                             &gen_hash_ops, CFS_HASH_DEFAULT);
         if (!obd->obd_gen_hash)
- -              GOTO(err_hash, err = -ENOMEM);
- -
- -        exp = class_new_export(obd, &obd->obd_uuid);
- -        if (IS_ERR(exp))
- -                GOTO(err_hash, err = PTR_ERR(exp));
+ +              GOTO(err_exit, err = -ENOMEM);
   
- -        obd->obd_self_export = exp;
- -      list_del_init(&exp->exp_obd_chain_timed);
- -        class_export_put(exp);
- -
- -        err = obd_setup(obd, lcfg);
- -        if (err)
- -                GOTO(err_exp, err);
+ +      err = obd_setup(obd, lcfg);
+ +      if (err)
+ +              GOTO(err_exit, err);
   
- -        obd->obd_set_up = 1;
+ +      obd->obd_set_up = 1;
   
         spin_lock(&obd->obd_dev_lock);
         /* cleanup drops this */
@@@ -542,7 -582,12 +542,7 @@@
                  obd->obd_name, obd->obd_uuid.uuid);
   
           RETURN(0);
- -err_exp:
- -        if (obd->obd_self_export) {
- -                class_unlink_export(obd->obd_self_export);
- -                obd->obd_self_export = NULL;
- -        }
- -err_hash:
+ +err_exit:
           if (obd->obd_uuid_hash) {
                   cfs_hash_putref(obd->obd_uuid_hash);
                   obd->obd_uuid_hash = NULL;
@@@ -586,14 -631,10 +586,14 @@@ int class_detach(struct obd_device *obd
         obd->obd_attached = 0;
         spin_unlock(&obd->obd_dev_lock);
   
+ +      /* cleanup in progress; this device must not be findable from now on */
+ +      class_unregister_device(obd);
+ +
           CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
                  obd->obd_name, obd->obd_uuid.uuid);
   
- -        class_decref(obd, "attach", obd);
+ +      class_decref(obd, "newdev", obd);
+ +
           RETURN(0);
   }
   EXPORT_SYMBOL(class_detach);
@@@ -623,9 -664,6 +623,9 @@@ int class_cleanup(struct obd_device *ob
         }
         /* Leave this on forever */
         obd->obd_stopping = 1;
+ +      /* function can't return error after that point, so clear setup flag
+ +       * as early as possible to avoid finding via obd_devs / hash */
+ +      obd->obd_set_up = 0;
         spin_unlock(&obd->obd_dev_lock);
   
         /* wait for already-arrived-connections to finish. */
@@@ -658,11 -696,17 +658,11 @@@
   
         LASSERT(obd->obd_self_export);
   
- -      /* The three references that should be remaining are the
- -       * obd_self_export and the attach and setup references. */
- -      if (atomic_read(&obd->obd_refcount) > 3) {
- -              /* refcounf - 3 might be the number of real exports
- -                 (excluding self export). But class_incref is called
- -                 by other things as well, so don't count on it. */
- -              CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
- -                     obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
- -              dump_exports(obd, 0, D_HA);
- -              class_disconnect_exports(obd);
- -      }
+ +      CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n",
+ +             obd->obd_name, obd->obd_num_exports,
+ +             atomic_read(&obd->obd_refcount) - 2);
+ +      dump_exports(obd, 0, D_HA);
+ +      class_disconnect_exports(obd);
   
         /* Precleanup, we must make sure all exports get destroyed. */
         err = obd_precleanup(obd);
@@@ -714,31 -758,43 +714,31 @@@ EXPORT_SYMBOL(class_incref)
   
   void class_decref(struct obd_device *obd, const char *scope, const void *source)
   {
- -      int err;
- -      int refs;
+ +      int last;
   
- -      spin_lock(&obd->obd_dev_lock);
- -      atomic_dec(&obd->obd_refcount);
- -      refs = atomic_read(&obd->obd_refcount);
- -      spin_unlock(&obd->obd_dev_lock);
+ +      CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd,
+ +             atomic_read(&obd->obd_refcount), scope);
+ +
+ +      LASSERT(obd->obd_num_exports >= 0);
+ +      last = atomic_dec_and_test(&obd->obd_refcount);
         lu_ref_del(&obd->obd_reference, scope, source);
   
- -      CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+ +      if (last) {
+ +              struct obd_export *exp;
   
- -      if ((refs == 1) && obd->obd_stopping) {
+ +              LASSERT(!obd->obd_attached);
                 /* All exports have been destroyed; there should
- -                 be no more in-progress ops by this point.*/
- -
- -              spin_lock(&obd->obd_self_export->exp_lock);
- -              obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
- -              spin_unlock(&obd->obd_self_export->exp_lock);
- -
- -                /* note that we'll recurse into class_decref again */
- -                class_unlink_export(obd->obd_self_export);
- -                return;
- -        }
+ +               * be no more in-progress ops by this point.*/
+ +              exp = obd->obd_self_export;
   
- -        if (refs == 0) {
- -                CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
- -                       obd->obd_name, obd->obd_uuid.uuid);
- -                LASSERT(!obd->obd_attached);
- -                if (obd->obd_stopping) {
- -                        /* If we're not stopping, we were never set up */
- -                        err = obd_cleanup(obd);
- -                        if (err)
- -                                CERROR("Cleanup %s returned %d\n",
- -                                       obd->obd_name, err);
+ +              if (exp) {
+ +                      exp->exp_flags |= exp_flags_from_obd(obd);
+ +                      /*
+ +                       * note that we'll recurse into class_decref again
+ +                       * but it's not a problem because we were the last user
+ +                       */
+ +                      class_unlink_export(exp);
                   }
- -
- -                class_release_dev(obd);
           }
   }
   EXPORT_SYMBOL(class_decref);
@@@ -1239,7 -1295,6 +1239,6 @@@ int class_process_config(struct lustre_
   
                   GOTO(out, err = -EINVAL);
           }
- 
         switch(lcfg->lcfg_command) {
         case LCFG_SETUP: {
                 err = class_setup(obd, lcfg);
@@@ -1279,12 -1334,47 +1278,47 @@@
                   err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
                   GOTO(out, err = 0);
           }
-         default: {
-                 err = obd_process_config(obd, sizeof(*lcfg), lcfg);
-                 GOTO(out, err);
+       /* Process config log ADD_MDC record twice to add MDC also to LOV
+        * for Data-on-MDT:
+        *
+        * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1
+        *     4:lustre-MDT0000-mdc_UUID
+        */
+       case LCFG_ADD_MDC: {
+               struct obd_device *lov_obd;
+               char *clilmv;
+ 
+               err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+               if (err)
+                       GOTO(out, err);
+ 
+               /* make sure this is client LMV log entry */
+               clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv");
+               if (!clilmv)
+                       GOTO(out, err);
+ 
+               /* replace 'lmv' with 'lov' name to address LOV device and
+                * process llog record to add MDC there. */
+               clilmv[4] = 'o';
+               lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+               if (lov_obd == NULL) {
+                       err = -ENOENT;
+                       CERROR("%s: Cannot find LOV by %s name, rc = %d\n",
+                              obd->obd_name, lustre_cfg_string(lcfg, 0), err);
+               } else {
+                       err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg);
+               }
+               /* restore 'lmv' name */
+               clilmv[4] = 'm';
+               GOTO(out, err);
+       }
+       default: {
+               err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+               GOTO(out, err);
   
           }
           }
+       EXIT;
   out:
           if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
                   CWARN("Ignoring error %d on optional command %#x\n", err,
@@@ -1371,12 -1461,12 +1405,12 @@@ int class_process_proc_param(char *pref
                         /* rc = -EINVAL;        continue parsing other params */
                         skip++;
                 } else if (rc < 0) {
- -                      CERROR("%s: error writing proc '%s'='%s': rc = %d\n",
- -                             lustre_cfg_string(lcfg, 0), key, sval, rc);
+ +                      CERROR("%s: error writing parameter '%s': rc = %d\n",
+ +                             lustre_cfg_string(lcfg, 0), key, rc);
                         rc = 0;
                 } else {
- -                      CDEBUG(D_CONFIG, "%s: Set parameter '%s'='%s'\n",
- -                             lustre_cfg_string(lcfg, 0), key, sval);
+ +                      CDEBUG(D_CONFIG, "%s: set parameter '%s'\n",
+ +                             lustre_cfg_string(lcfg, 0), key);
                 }
         }
   
diff --combined lustre/ofd/lproc_ofd.c

index 85ed221,27fa94d..1953fcf
--- 1/lustre/ofd/lproc_ofd.c
--- 2/lustre/ofd/lproc_ofd.c
+++ b/lustre/ofd/lproc_ofd.c
@@@ -70,69 -70,6 +70,6 @@@ static int ofd_seqs_seq_show(struct seq
   LPROC_SEQ_FOPS_RO(ofd_seqs);
   
   /**
-  * Show estimate of total amount of dirty data on clients.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_tot_dirty_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd;
- 
-       LASSERT(obd != NULL);
-       tgd = &obd->u.obt.obt_lut->lut_tgd;
-       seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
-       return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_dirty);
- 
- /**
-  * Show total amount of space granted to clients.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_tot_granted_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd;
- 
-       LASSERT(obd != NULL);
-       tgd = &obd->u.obt.obt_lut->lut_tgd;
-       seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
-       return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_granted);
- 
- /**
-  * Show total amount of space used by IO in progress.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_tot_pending_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd;
- 
-       LASSERT(obd != NULL);
-       tgd = &obd->u.obt.obt_lut->lut_tgd;
-       seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
-       return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_pending);
- 
- /**
    * Show total number of grants for precreate.
    *
    * \param[in] m               seq_file handle
@@@ -298,6 -235,9 +235,6 @@@ LPROC_SEQ_FOPS(ofd_fmd_max_num)
   /**
    * Show the maximum age of FMD data in seconds.
    *
- - * Though it is shown in seconds, it is stored internally in units
- - * of jiffies for efficiency.
- - *
    * \param[in] m               seq_file handle
    * \param[in] data    unused for single entry
    *
@@@ -309,7 -249,8 +246,7 @@@ static int ofd_fmd_max_age_seq_show(str
         struct obd_device *obd = m->private;
         struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
   
- -      seq_printf(m, "%ld\n", jiffies_to_msecs(ofd->ofd_fmd_max_age) /
- -                 MSEC_PER_SEC);
+ +      seq_printf(m, "%lld\n", ofd->ofd_fmd_max_age);
         return 0;
   }
   
@@@ -317,6 -258,7 +254,6 @@@
    * Set the maximum age of FMD data in seconds.
    *
    * This defines how long FMD data stays in the FMD list.
- - * It is stored internally in units of jiffies for efficiency.
    *
    * \param[in] file    proc file
    * \param[in] buffer  string which represents maximum number
@@@ -343,7 -285,7 +280,7 @@@ ofd_fmd_max_age_seq_write(struct file *
         if (val > 65536 || val < 1)
                 return -EINVAL;
   
- -      ofd->ofd_fmd_max_age = msecs_to_jiffies(val * MSEC_PER_SEC);
+ +      ofd->ofd_fmd_max_age = val;
         return count;
   }
   LPROC_SEQ_FOPS(ofd_fmd_max_age);
@@@ -629,70 -571,6 +566,6 @@@ ofd_sync_lock_cancel_seq_write(struct f
   LPROC_SEQ_FOPS(ofd_sync_lock_cancel);
   
   /**
-  * Show if grants compatibility mode is disabled.
-  *
-  * When tgd_grant_compat_disable is set, we don't grant any space to clients
-  * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
-  * a client is inflated since it consumes PAGE_SIZE of grant space per
-  * block, (i.e. typically 4kB units), but underlaying file system might have
-  * block size bigger than page size, e.g. ZFS. See LU-2049 for details.
-  *
-  * \param[in] m               seq_file handle
-  * \param[in] data    unused for single entry
-  *
-  * \retval            0 on success
-  * \retval            negative value on error
-  */
- static int ofd_grant_compat_disable_seq_show(struct seq_file *m, void *data)
- {
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
- 
-       seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
-       return 0;
- }
- 
- /**
-  * Change grant compatibility mode.
-  *
-  * Setting tgd_grant_compat_disable prohibit any space granting to clients
-  * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
-  *
-  * \param[in] file    proc file
-  * \param[in] buffer  string which represents mode
-  *                    1: disable compatibility mode
-  *                    0: enable compatibility mode
-  * \param[in] count   \a buffer length
-  * \param[in] off     unused for single entry
-  *
-  * \retval            \a count on success
-  * \retval            negative number on error
-  */
- static ssize_t
- ofd_grant_compat_disable_seq_write(struct file *file,
-                                  const char __user *buffer,
-                                  size_t count, loff_t *off)
- {
-       struct seq_file *m = file->private_data;
-       struct obd_device *obd = m->private;
-       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
-       __s64 val;
-       int rc;
- 
-       rc = lprocfs_str_to_s64(buffer, count, &val);
-       if (rc)
-               return rc;
- 
-       if (val < 0)
-               return -EINVAL;
- 
-       tgd->tgd_grant_compat_disable = !!val;
- 
-       return count;
- }
- LPROC_SEQ_FOPS(ofd_grant_compat_disable);
- 
- /**
    * Show the limit of soft sync RPCs.
    *
    * This value defines how many IO RPCs with OBD_BRW_SOFT_SYNC flag
@@@ -893,6 -771,11 +766,11 @@@ LPROC_SEQ_FOPS_RW_TYPE(ofd, ir_factor)
   LPROC_SEQ_FOPS_RW_TYPE(ofd, checksum_dump);
   LPROC_SEQ_FOPS_RW_TYPE(ofd, job_interval);
   
+ LPROC_SEQ_FOPS_RO(tgt_tot_dirty);
+ LPROC_SEQ_FOPS_RO(tgt_tot_granted);
+ LPROC_SEQ_FOPS_RO(tgt_tot_pending);
+ LPROC_SEQ_FOPS(tgt_grant_compat_disable);
+ 
   struct lprocfs_vars lprocfs_ofd_obd_vars[] = {
         { .name =       "seqs_allocated",
           .fops =       &ofd_seqs_fops                  },
@@@ -901,11 -784,11 +779,11 @@@
         { .name =       "last_id",
           .fops =       &ofd_last_id_fops               },
         { .name =       "tot_dirty",
-         .fops =       &ofd_tot_dirty_fops             },
+         .fops =       &tgt_tot_dirty_fops             },
         { .name =       "tot_pending",
-         .fops =       &ofd_tot_pending_fops           },
+         .fops =       &tgt_tot_pending_fops           },
         { .name =       "tot_granted",
-         .fops =       &ofd_tot_granted_fops           },
+         .fops =       &tgt_tot_granted_fops           },
         { .name =       "grant_precreate",
           .fops =       &ofd_grant_precreate_fops       },
         { .name =       "precreate_batch",
@@@ -935,7 -818,7 +813,7 @@@
         { .name =       "checksum_dump",
           .fops =       &ofd_checksum_dump_fops         },
         { .name =       "grant_compat_disable",
-         .fops =       &ofd_grant_compat_disable_fops  },
+         .fops =       &tgt_grant_compat_disable_fops  },
         { .name =       "client_cache_count",
           .fops =       &ofd_fmd_max_num_fops           },
         { .name =       "client_cache_seconds",
diff --combined lustre/ofd/ofd_dev.c

index 4cf5e53,fcb477a..6c0abd0
--- 1/lustre/ofd/ofd_dev.c
--- 2/lustre/ofd/ofd_dev.c
+++ b/lustre/ofd/ofd_dev.c
@@@ -1698,10 -1698,10 +1698,10 @@@ static int ofd_create_hdl(struct tgt_se
                 }
         }
         if (diff > 0) {
- -              cfs_time_t       enough_time = cfs_time_shift(DISK_TIMEOUT);
- -              u64              next_id;
- -              int              created = 0;
- -              int              count;
+ +              time64_t enough_time = ktime_get_seconds() + DISK_TIMEOUT;
+ +              u64 next_id;
+ +              int created = 0;
+ +              int count;
   
                 if (!(oa->o_valid & OBD_MD_FLFLAGS) ||
                     !(oa->o_flags & OBD_FL_DELORPHAN)) {
@@@ -1749,7 -1749,7 +1749,7 @@@
                                count, seq, next_id);
   
                         if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
- -                          && cfs_time_after(jiffies, enough_time)) {
+ +                          && ktime_get_seconds() > enough_time) {
                                 CDEBUG(D_HA, "%s: Slow creates, %d/%lld objects"
                                       " created at a rate of %d/s\n",
                                       ofd_name(ofd), created, diff + created,
@@@ -2364,16 -2364,16 +2364,16 @@@ static int ofd_quotactl(struct tgt_sess
    *
    * \retval            amount of time to extend the timeout with
    */
- -static inline int prolong_timeout(struct ptlrpc_request *req)
+ +static inline time64_t prolong_timeout(struct ptlrpc_request *req)
   {
         struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
- -      time_t req_timeout;
+ +      time64_t req_timeout;
   
         if (AT_OFF)
                 return obd_timeout / 2;
   
         req_timeout = req->rq_deadline - req->rq_arrival_time.tv_sec;
- -      return max_t(time_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)),
+ +      return max_t(time64_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)),
                      req_timeout);
   }
   
@@@ -2902,7 -2902,6 +2902,6 @@@ static int ofd_init0(const struct lu_en
         struct ofd_thread_info *info = NULL;
         struct obd_device *obd;
         struct tg_grants_data *tgd = &m->ofd_lut.lut_tgd;
-       struct obd_statfs *osfs;
         struct lu_fid fid;
         struct nm_config_file *nodemap_config;
         struct obd_device_target *obt;
@@@ -2930,22 -2929,8 +2929,8 @@@
         m->ofd_raid_degraded = 0;
         m->ofd_syncjournal = 0;
         ofd_slc_set(m);
-       tgd->tgd_grant_compat_disable = 0;
         m->ofd_soft_sync_limit = OFD_SOFT_SYNC_LIMIT_DEFAULT;
   
-       /* statfs data */
-       spin_lock_init(&tgd->tgd_osfs_lock);
-       tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
-       tgd->tgd_osfs_unstable = 0;
-       tgd->tgd_statfs_inflight = 0;
-       tgd->tgd_osfs_inflight = 0;
- 
-       /* grant data */
-       spin_lock_init(&tgd->tgd_grant_lock);
-       tgd->tgd_tot_dirty = 0;
-       tgd->tgd_tot_granted = 0;
-       tgd->tgd_tot_pending = 0;
- 
         m->ofd_seq_count = 0;
         init_waitqueue_head(&m->ofd_inconsistency_thread.t_ctl_waitq);
         INIT_LIST_HEAD(&m->ofd_inconsistency_list);
@@@ -3008,27 -2993,13 +2993,13 @@@
         ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
                            "filter_ldlm_cb_client", &obd->obd_ldlm_client);
   
-       dt_conf_get(env, m->ofd_osd, &m->ofd_lut.lut_dt_conf);
- 
         rc = tgt_init(env, &m->ofd_lut, obd, m->ofd_osd, ofd_common_slice,
                       OBD_FAIL_OST_ALL_REQUEST_NET,
                       OBD_FAIL_OST_ALL_REPLY_NET);
         if (rc)
                 GOTO(err_free_ns, rc);
   
-       /* populate cached statfs data */
-       osfs = &ofd_info(env)->fti_u.osfs;
-       rc = tgt_statfs_internal(env, &m->ofd_lut, osfs, 0, NULL);
-       if (rc != 0) {
-               CERROR("%s: can't get statfs data, rc %d\n", obd->obd_name, rc);
-               GOTO(err_fini_lut, rc);
-       }
-       if (!is_power_of_2(osfs->os_bsize)) {
-               CERROR("%s: blocksize (%d) is not a power of 2\n",
-                       obd->obd_name, osfs->os_bsize);
-               GOTO(err_fini_lut, rc = -EPROTO);
-       }
-       tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
+       tgd->tgd_reserved_pcnt = 0;
   
         if (DT_DEF_BRW_SIZE < (1U << tgd->tgd_blockbits))
                 m->ofd_brw_size = 1U << tgd->tgd_blockbits;
@@@ -3037,7 -3008,8 +3008,8 @@@
   
         m->ofd_cksum_types_supported = cksum_types_supported_server();
         m->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT;
-       if (osfs->os_bsize * osfs->os_blocks < OFD_PRECREATE_SMALL_FS)
+       if (tgd->tgd_osfs.os_bsize * tgd->tgd_osfs.os_blocks <
+           OFD_PRECREATE_SMALL_FS)
                 m->ofd_precreate_batch = OFD_PRECREATE_BATCH_SMALL;
   
         rc = ofd_fs_setup(env, m, obd);
@@@ -3260,13 -3232,6 +3232,6 @@@ static int __init ofd_init(void
                 return(rc);
         }
   
-       rc = ofd_dlm_init();
-       if (rc) {
-               lu_kmem_fini(ofd_caches);
-               ofd_fmd_exit();
-               return rc;
-       }
- 
         rc = class_register_type(&ofd_obd_ops, NULL, true, NULL,
                                  LUSTRE_OST_NAME, &ofd_device_type);
         return rc;
@@@ -3281,7 -3246,6 +3246,6 @@@
   static void __exit ofd_exit(void)
   {
         ofd_fmd_exit();
-       ofd_dlm_exit();
         lu_kmem_fini(ofd_caches);
         class_unregister_type(LUSTRE_OST_NAME);
   }
diff --combined lustre/ofd/ofd_dlm.c

index 80420ef,76e2bd7..2755d7e
--- 1/lustre/ofd/ofd_dlm.c
--- 2/lustre/ofd/ofd_dlm.c
+++ b/lustre/ofd/ofd_dlm.c
@@@ -51,25 -51,6 +51,6 @@@ struct ofd_intent_args 
         int                     error;
   };
   
- int ofd_dlm_init(void)
- {
-       ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
-                                            sizeof(struct ldlm_glimpse_work),
-                                            0, 0, NULL);
-       if (ldlm_glimpse_work_kmem == NULL)
-               return -ENOMEM;
-       else
-               return 0;
- }
- 
- void ofd_dlm_exit(void)
- {
-       if (ldlm_glimpse_work_kmem) {
-               kmem_cache_destroy(ldlm_glimpse_work_kmem);
-               ldlm_glimpse_work_kmem = NULL;
-       }
- }
- 
   /**
    * OFD interval callback.
    *
@@@ -138,6 -119,7 +119,6 @@@ static enum interval_iter ofd_intent_cb
   
         /* Find the 'victim' lock from this interval */
         list_for_each_entry(lck, &node->li_group, l_sl_policy) {
- -
                 victim_lock = LDLM_LOCK_GET(lck);
   
                 /* the same policy group - every lock has the
@@@ -253,6 -235,11 +234,11 @@@ int ofd_intent_policy(struct ldlm_names
         struct ldlm_glimpse_work *pos, *tmp;
         ENTRY;
   
+       /* update stats for intent in intent policy */
+       if (ptlrpc_req2svc(req)->srv_stats != NULL)
+               lprocfs_counter_incr(ptlrpc_req2svc(req)->srv_stats,
+                                    PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE);
+ 
         INIT_LIST_HEAD(&arg.gl_list);
         arg.no_glimpse_ast = false;
         arg.error = 0;
diff --combined lustre/ofd/ofd_internal.h

index 2af6d26,6cc7952..9c7a582
--- 1/lustre/ofd/ofd_internal.h
--- 2/lustre/ofd/ofd_internal.h
+++ b/lustre/ofd/ofd_internal.h
@@@ -56,12 -56,12 +56,12 @@@ struct ofd_mod_data 
         struct list_head fmd_list;        /* linked to fed_mod_list */
         struct lu_fid    fmd_fid;         /* FID being written to */
         __u64            fmd_mactime_xid; /* xid highest {m,a,c}time setattr */
- -      cfs_time_t       fmd_expire;      /* time when the fmd should expire */
+ +      time64_t         fmd_expire;      /* time when the fmd should expire */
         int              fmd_refcount;    /* reference counter - list holds 1 */
   };
   
   #define OFD_FMD_MAX_NUM_DEFAULT 128
- -#define OFD_FMD_MAX_AGE_DEFAULT msecs_to_jiffies((obd_timeout+10)*MSEC_PER_SEC)
+ +#define OFD_FMD_MAX_AGE_DEFAULT (obd_timeout + 10)
   
   #define OFD_SOFT_SYNC_LIMIT_DEFAULT 16
   
@@@ -137,7 -137,7 +137,7 @@@ struct ofd_device 
   
         /* ofd mod data: ofd_device wide values */
         int                      ofd_fmd_max_num; /* per ofd ofd_mod_data */
- -      cfs_duration_t           ofd_fmd_max_age; /* time to fmd expiry */
+ +      time64_t                 ofd_fmd_max_age; /* time to fmd expiry */
   
         spinlock_t               ofd_flags_lock;
         unsigned long            ofd_raid_degraded:1,
@@@ -419,8 -419,7 +419,7 @@@ extern struct ldlm_valblock_ops ofd_lvb
   
   /* ofd_dlm.c */
   extern struct kmem_cache *ldlm_glimpse_work_kmem;
- int ofd_dlm_init(void);
- void ofd_dlm_exit(void);
+ 
   int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp,
                       void *req_cookie, enum ldlm_mode mode, __u64 flags,
                       void *data);
diff --combined lustre/osc/osc_page.c

index e929211,6813f12..a141681
--- 1/lustre/osc/osc_page.c
--- 2/lustre/osc/osc_page.c
+++ b/lustre/osc/osc_page.c
@@@ -301,6 -301,7 +301,7 @@@ int osc_page_init(const struct lu_env *
   
         return result;
   }
+ EXPORT_SYMBOL(osc_page_init);
   
   /**
    * Helper function called by osc_io_submit() for every page in an immediate
@@@ -684,6 -685,7 +685,7 @@@ long osc_lru_shrink(const struct lu_en
         }
         RETURN(count > 0 ? count : rc);
   }
+ EXPORT_SYMBOL(osc_lru_shrink);
   
   /**
    * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least
@@@ -776,6 -778,7 +778,7 @@@ static int osc_lru_alloc(const struct l
         struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
         struct osc_io *oio = osc_env_io(env);
         int rc = 0;
+ 
         ENTRY;
   
         if (cli->cl_cache == NULL) /* shall not be in LRU */
@@@ -874,27 -877,17 +877,27 @@@ void osc_lru_unreserve(struct client_ob
    * are likely from the same page zone.
    */
   static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
+ +                                          struct osc_brw_async_args *aa,
                                             int factor)
   {
- -      int page_count = desc->bd_iov_count;
+ +      int page_count;
         void *zone = NULL;
         int count = 0;
         int i;
   
- -      LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
+ +      if (desc != NULL) {
+ +              LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
+ +              page_count = desc->bd_iov_count;
+ +      } else {
+ +              page_count = aa->aa_page_count;
+ +      }
   
         for (i = 0; i < page_count; i++) {
- -              void *pz = page_zone(BD_GET_KIOV(desc, i).kiov_page);
+ +              void *pz;
+ +              if (desc)
+ +                      pz = page_zone(BD_GET_KIOV(desc, i).kiov_page);
+ +              else
+ +                      pz = page_zone(aa->aa_ppga[i]->pg);
   
                 if (likely(pz == zone)) {
                         ++count;
@@@ -913,16 -906,14 +916,16 @@@
                 mod_zone_page_state(zone, NR_UNSTABLE_NFS, factor * count);
   }
   
- -static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
+ +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
+ +                                              struct osc_brw_async_args *aa)
   {
- -      unstable_page_accounting(desc, 1);
+ +      unstable_page_accounting(desc, aa, 1);
   }
   
- -static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
+ +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
+ +                                              struct osc_brw_async_args *aa)
   {
- -      unstable_page_accounting(desc, -1);
+ +      unstable_page_accounting(desc, aa, -1);
   }
   
   /**
@@@ -939,19 -930,12 +942,19 @@@
   void osc_dec_unstable_pages(struct ptlrpc_request *req)
   {
         struct ptlrpc_bulk_desc *desc       = req->rq_bulk;
+ +      struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
         struct client_obd       *cli        = &req->rq_import->imp_obd->u.cli;
- -      int                      page_count = desc->bd_iov_count;
+ +      int                      page_count;
         long                     unstable_count;
   
+ +      if (desc)
+ +              page_count = desc->bd_iov_count;
+ +      else
+ +              page_count = aa->aa_page_count;
+ +
         LASSERT(page_count >= 0);
- -      dec_unstable_page_accounting(desc);
+ +
+ +      dec_unstable_page_accounting(desc, aa);
   
         unstable_count = atomic_long_sub_return(page_count,
                                                 &cli->cl_unstable_count);
@@@ -973,20 -957,14 +976,20 @@@
   void osc_inc_unstable_pages(struct ptlrpc_request *req)
   {
         struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+ +      struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
         struct client_obd       *cli  = &req->rq_import->imp_obd->u.cli;
- -      long                     page_count = desc->bd_iov_count;
+ +      long                     page_count;
   
         /* No unstable page tracking */
         if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check)
                 return;
   
- -      add_unstable_page_accounting(desc);
+ +      if (desc)
+ +              page_count = desc->bd_iov_count;
+ +      else
+ +              page_count = aa->aa_page_count;
+ +
+ +      add_unstable_page_accounting(desc, aa);
         atomic_long_add(page_count, &cli->cl_unstable_count);
         atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr);
   
diff --combined lustre/osc/osc_request.c

index 63ae9b0,e4c6a04..5c6438c
--- 1/lustre/osc/osc_request.c
--- 2/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@@ -58,6 -58,18 +58,6 @@@ struct ptlrpc_request_pool *osc_rq_pool
   static unsigned int osc_reqpool_mem_max = 5;
   module_param(osc_reqpool_mem_max, uint, 0444);
   
- -struct osc_brw_async_args {
- -      struct obdo              *aa_oa;
- -      int                       aa_requested_nob;
- -      int                       aa_nio_count;
- -      u32                       aa_page_count;
- -      int                       aa_resends;
- -      struct brw_page **aa_ppga;
- -      struct client_obd        *aa_cli;
- -      struct list_head          aa_oaps;
- -      struct list_head          aa_exts;
- -};
- -
   #define osc_grant_args osc_brw_async_args
   
   struct osc_setattr_args {
@@@ -79,18 -91,6 +79,6 @@@ struct osc_ladvise_args 
         void                    *la_cookie;
   };
   
- struct osc_enqueue_args {
-       struct obd_export       *oa_exp;
-       enum ldlm_type          oa_type;
-       enum ldlm_mode          oa_mode;
-       __u64                   *oa_flags;
-       osc_enqueue_upcall_f    oa_upcall;
-       void                    *oa_cookie;
-       struct ost_lvb          *oa_lvb;
-       struct lustre_handle    oa_lockh;
-       bool                    oa_speculative;
- };
- 
   static void osc_release_ppga(struct brw_page **ppga, size_t count);
   static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req,
                          void *data, int rc);
@@@ -396,31 -396,34 +384,34 @@@ out
         RETURN(rc);
   }
   
- int osc_punch_base(struct obd_export *exp, struct obdo *oa,
-                    obd_enqueue_update_f upcall, void *cookie,
-                    struct ptlrpc_request_set *rqset)
+ int osc_punch_send(struct obd_export *exp, struct obdo *oa,
+                  obd_enqueue_update_f upcall, void *cookie)
   {
-         struct ptlrpc_request   *req;
-         struct osc_setattr_args *sa;
-         struct ost_body         *body;
-         int                      rc;
-         ENTRY;
+       struct ptlrpc_request *req;
+       struct osc_setattr_args *sa;
+       struct obd_import *imp = class_exp2cliimp(exp);
+       struct ost_body *body;
+       int rc;
   
-         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
-         if (req == NULL)
-                 RETURN(-ENOMEM);
+       ENTRY;
   
-         rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
-         if (rc) {
-                 ptlrpc_request_free(req);
-                 RETURN(rc);
-         }
-         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
-         ptlrpc_at_set_req_timeout(req);
+       req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+ 
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+       if (rc < 0) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+ 
+       osc_set_io_portal(req);
+ 
+       ptlrpc_at_set_req_timeout(req);
   
         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
-       LASSERT(body);
-       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+ 
+       lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
   
         ptlrpc_request_set_replen(req);
   
@@@ -430,13 -433,12 +421,12 @@@
         sa->sa_oa = oa;
         sa->sa_upcall = upcall;
         sa->sa_cookie = cookie;
-       if (rqset == PTLRPCD_SET)
-               ptlrpcd_add_req(req);
-       else
-               ptlrpc_set_add_req(rqset, req);
+ 
+       ptlrpcd_add_req(req);
   
         RETURN(0);
   }
+ EXPORT_SYMBOL(osc_punch_send);
   
   static int osc_sync_interpret(const struct lu_env *env,
                                 struct ptlrpc_request *req,
@@@ -709,11 -711,10 +699,11 @@@ static void osc_announce_cached(struct 
   
   void osc_update_next_shrink(struct client_obd *cli)
   {
- -        cli->cl_next_shrink_grant =
- -                cfs_time_shift(cli->cl_grant_shrink_interval);
- -        CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
- -               cli->cl_next_shrink_grant);
+ +      cli->cl_next_shrink_grant = ktime_get_seconds() +
+ +                                  cli->cl_grant_shrink_interval;
+ +
+ +      CDEBUG(D_CACHE, "next time %lld to shrink grant\n",
+ +             cli->cl_next_shrink_grant);
   }
   
   static void __osc_update_grant(struct client_obd *cli, u64 grant)
@@@ -731,11 -732,6 +721,6 @@@ static void osc_update_grant(struct cli
           }
   }
   
- static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
-                             u32 keylen, void *key,
-                             u32 vallen, void *val,
-                             struct ptlrpc_request_set *set);
- 
   static int osc_shrink_grant_interpret(const struct lu_env *env,
                                         struct ptlrpc_request *req,
                                         void *aa, int rc)
@@@ -835,13 -831,14 +820,13 @@@ int osc_shrink_grant_to_target(struct c
   
   static int osc_should_shrink_grant(struct client_obd *client)
   {
- -        cfs_time_t time = cfs_time_current();
- -        cfs_time_t next_shrink = client->cl_next_shrink_grant;
+ +      time64_t next_shrink = client->cl_next_shrink_grant;
   
           if ((client->cl_import->imp_connect_data.ocd_connect_flags &
                OBD_CONNECT_GRANT_SHRINK) == 0)
                   return 0;
   
- -      if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) {
+ +      if (ktime_get_seconds() >= next_shrink - 5) {
                 /* Get the current RPC size directly, instead of going via:
                  * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
                  * Keep comment here so that it can be found by searching. */
@@@ -890,7 -887,7 +875,7 @@@ static int osc_del_shrink_grant(struct 
                                            TIMEOUT_GRANT);
   }
   
- static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
   {
         /*
          * ocd_grant is the total grant amount we're expect to hold: if we've
@@@ -947,6 -944,7 +932,7 @@@
             list_empty(&cli->cl_grant_shrink_list))
                 osc_add_shrink_grant(cli);
   }
+ EXPORT_SYMBOL(osc_init_grant);
   
   /* We assume that the reason this OSC got a short read is because it read
    * beyond the end of a stripe file; i.e. lustre is reading a sparse file
@@@ -1013,8 -1011,8 +999,8 @@@ static int check_write_rcs(struct ptlrp
                           return(-EPROTO);
                   }
           }
- -
- -        if (req->rq_bulk->bd_nob_transferred != requested_nob) {
+ +      if (req->rq_bulk != NULL &&
+ +          req->rq_bulk->bd_nob_transferred != requested_nob) {
                   CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
                          req->rq_bulk->bd_nob_transferred, requested_nob);
                   return(-EPROTO);
@@@ -1107,11 -1105,10 +1093,11 @@@ osc_brw_prep_request(int cmd, struct cl
           struct ost_body         *body;
           struct obd_ioobj        *ioobj;
           struct niobuf_remote    *niobuf;
- -        int niocount, i, requested_nob, opc, rc;
+ +      int niocount, i, requested_nob, opc, rc, short_io_size = 0;
           struct osc_brw_async_args *aa;
           struct req_capsule      *pill;
           struct brw_page *pg_prev;
+ +      void *short_io_buf;
   
           ENTRY;
           if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
@@@ -1142,38 -1139,18 +1128,38 @@@
           req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
                                niocount * sizeof(*niobuf));
   
+ +      for (i = 0; i < page_count; i++)
+ +              short_io_size += pga[i]->count;
+ +
+ +      /* Check if we can do a short io. */
+ +      if (!(short_io_size <= cli->cl_short_io_bytes && niocount == 1 &&
+ +          imp_connect_shortio(cli->cl_import)))
+ +              short_io_size = 0;
+ +
+ +      req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
+ +                           opc == OST_READ ? 0 : short_io_size);
+ +      if (opc == OST_READ)
+ +              req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
+ +                                   short_io_size);
+ +
           rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
           if (rc) {
                   ptlrpc_request_free(req);
                   RETURN(rc);
           }
-         req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
-         ptlrpc_at_set_req_timeout(req);
+       osc_set_io_portal(req);
   
+       ptlrpc_at_set_req_timeout(req);
         /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
          * retry logic */
         req->rq_no_retry_einprogress = 1;
   
+ +      if (short_io_size != 0) {
+ +              desc = NULL;
+ +              short_io_buf = NULL;
+ +              goto no_bulk;
+ +      }
+ +
         desc = ptlrpc_prep_bulk_imp(req, page_count,
                 cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
                 (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
@@@ -1185,7 -1162,7 +1171,7 @@@
           if (desc == NULL)
                   GOTO(out, rc = -ENOMEM);
           /* NB request now owns desc and will free it when it gets freed */
- -
+ +no_bulk:
           body = req_capsule_client_get(pill, &RMF_OST_BODY);
           ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
           niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
@@@ -1200,26 -1177,7 +1186,26 @@@
          * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
          * "max - 1" for old client compatibility sending "0", and also so the
          * the actual maximum is a power-of-two number, not one less. LU-1431 */
- -      ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+ +      if (desc != NULL)
+ +              ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+ +      else /* short io */
+ +              ioobj_max_brw_set(ioobj, 0);
+ +
+ +      if (short_io_size != 0) {
+ +              if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+ +                      body->oa.o_valid |= OBD_MD_FLFLAGS;
+ +                      body->oa.o_flags = 0;
+ +              }
+ +              body->oa.o_flags |= OBD_FL_SHORT_IO;
+ +              CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n",
+ +                     short_io_size);
+ +              if (opc == OST_WRITE) {
+ +                      short_io_buf = req_capsule_client_get(pill,
+ +                                                            &RMF_SHORT_IO);
+ +                      LASSERT(short_io_buf != NULL);
+ +              }
+ +      }
+ +
         LASSERT(page_count > 0);
         pg_prev = pga[0];
           for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
@@@ -1244,19 -1202,9 +1230,19 @@@
                            pg_prev->pg->index, pg_prev->off);
                   LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
                           (pg->flag & OBD_BRW_SRVLOCK));
- -
- -              desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count);
- -                requested_nob += pg->count;
+ +              if (short_io_size != 0 && opc == OST_WRITE) {
+ +                      unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0);
+ +
+ +                      LASSERT(short_io_size >= requested_nob + pg->count);
+ +                      memcpy(short_io_buf + requested_nob,
+ +                             ptr + poff,
+ +                             pg->count);
+ +                      ll_kunmap_atomic(ptr, KM_USER0);
+ +              } else if (short_io_size == 0) {
+ +                      desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff,
+ +                                                       pg->count);
+ +              }
+ +              requested_nob += pg->count;
   
                   if (i > 0 && can_merge_pages(pg_prev, pg)) {
                           niobuf--;
@@@ -1331,17 -1279,17 +1317,17 @@@
                  * resent due to cksum error, this will allow Server to
                  * check+dump pages on its side */
         }
- -        ptlrpc_request_set_replen(req);
+ +      ptlrpc_request_set_replen(req);
   
- -        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
- -        aa = ptlrpc_req_async_args(req);
- -        aa->aa_oa = oa;
- -        aa->aa_requested_nob = requested_nob;
- -        aa->aa_nio_count = niocount;
- -        aa->aa_page_count = page_count;
- -        aa->aa_resends = 0;
- -        aa->aa_ppga = pga;
- -        aa->aa_cli = cli;
+ +      CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ +      aa = ptlrpc_req_async_args(req);
+ +      aa->aa_oa = oa;
+ +      aa->aa_requested_nob = requested_nob;
+ +      aa->aa_nio_count = niocount;
+ +      aa->aa_page_count = page_count;
+ +      aa->aa_resends = 0;
+ +      aa->aa_ppga = pga;
+ +      aa->aa_cli = cli;
         INIT_LIST_HEAD(&aa->aa_oaps);
   
         *reqp = req;
@@@ -1525,9 -1473,9 +1511,9 @@@ static int osc_brw_fini_request(struct 
                           CERROR("Unexpected +ve rc %d\n", rc);
                           RETURN(-EPROTO);
                   }
- -                LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
   
- -                if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+ +              if (req->rq_bulk != NULL &&
+ +                  sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
                           RETURN(-EAGAIN);
   
                   if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
@@@ -1542,14 -1490,8 +1528,14 @@@
   
           /* The rest of this function executes only for OST_READs */
   
- -        /* if unwrap_bulk failed, return -EAGAIN to retry */
- -        rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+ +      if (req->rq_bulk == NULL) {
+ +              rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO,
+ +                                        RCL_SERVER);
+ +              LASSERT(rc == req->rq_status);
+ +      } else {
+ +              /* if unwrap_bulk failed, return -EAGAIN to retry */
+ +              rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+ +      }
           if (rc < 0)
                   GOTO(out, rc = -EAGAIN);
   
@@@ -1559,41 -1501,12 +1545,41 @@@
                   RETURN(-EPROTO);
           }
   
- -        if (rc != req->rq_bulk->bd_nob_transferred) {
+ +      if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) {
                   CERROR ("Unexpected rc %d (%d transferred)\n",
                           rc, req->rq_bulk->bd_nob_transferred);
                   return (-EPROTO);
           }
   
+ +      if (req->rq_bulk == NULL) {
+ +              /* short io */
+ +              int nob, pg_count, i = 0;
+ +              unsigned char *buf;
+ +
+ +              CDEBUG(D_CACHE, "Using short io read, size %d\n", rc);
+ +              pg_count = aa->aa_page_count;
+ +              buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO,
+ +                                                 rc);
+ +              nob = rc;
+ +              while (nob > 0 && pg_count > 0) {
+ +                      unsigned char *ptr;
+ +                      int count = aa->aa_ppga[i]->count > nob ?
+ +                                  nob : aa->aa_ppga[i]->count;
+ +
+ +                      CDEBUG(D_CACHE, "page %p count %d\n",
+ +                             aa->aa_ppga[i]->pg, count);
+ +                      ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0);
+ +                      memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf,
+ +                             count);
+ +                      ll_kunmap_atomic((void *) ptr, KM_USER0);
+ +
+ +                      buf += count;
+ +                      nob -= count;
+ +                      i++;
+ +                      pg_count--;
+ +              }
+ +      }
+ +
           if (rc < aa->aa_requested_nob)
                   handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
   
@@@ -1610,8 -1523,7 +1596,8 @@@
                                                    aa->aa_ppga, OST_READ,
                                                    cksum_type);
   
- -              if (peer->nid != req->rq_bulk->bd_sender) {
+ +              if (req->rq_bulk != NULL &&
+ +                  peer->nid != req->rq_bulk->bd_sender) {
                         via = " via ";
                         router = libcfs_nid2str(req->rq_bulk->bd_sender);
                 }
@@@ -1785,7 -1697,6 +1771,7 @@@ static int brw_interpret(const struct l
         struct osc_extent *ext;
         struct osc_extent *tmp;
         struct client_obd *cli = aa->aa_cli;
+ +      unsigned long           transferred = 0;
           ENTRY;
   
           rc = osc_brw_fini_request(req, rc);
@@@ -1878,12 -1789,8 +1864,12 @@@
         LASSERT(list_empty(&aa->aa_exts));
         LASSERT(list_empty(&aa->aa_oaps));
   
+ +      transferred = (req->rq_bulk == NULL ? /* short io */
+ +                     aa->aa_requested_nob :
+ +                     req->rq_bulk->bd_nob_transferred);
+ +
         osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
- -      ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
+ +      ptlrpc_lprocfs_brw(req, transferred);
   
         spin_lock(&cli->cl_loi_list_lock);
         /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
@@@ -2112,10 -2019,10 +2098,10 @@@ static int osc_set_lock_data(struct ldl
         return set;
   }
   
- static int osc_enqueue_fini(struct ptlrpc_request *req,
-                           osc_enqueue_upcall_f upcall, void *cookie,
-                           struct lustre_handle *lockh, enum ldlm_mode mode,
-                           __u64 *flags, bool speculative, int errcode)
+ int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
+                    void *cookie, struct lustre_handle *lockh,
+                    enum ldlm_mode mode, __u64 *flags, bool speculative,
+                    int errcode)
   {
         bool intent = *flags & LDLM_FL_HAS_INTENT;
         int rc;
@@@ -2147,12 -2054,11 +2133,11 @@@
         if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
                 ldlm_lock_decref(lockh, mode);
   
-         RETURN(rc);
+       RETURN(rc);
   }
   
- static int osc_enqueue_interpret(const struct lu_env *env,
-                                struct ptlrpc_request *req,
-                                struct osc_enqueue_args *aa, int rc)
+ int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
+                         struct osc_enqueue_args *aa, int rc)
   {
         struct ldlm_lock *lock;
         struct lustre_handle *lockh = &aa->oa_lockh;
@@@ -2196,7 -2102,7 +2181,7 @@@
         rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
                               aa->oa_flags, aa->oa_speculative, rc);
   
-         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
   
         ldlm_lock_decref(lockh, mode);
         LDLM_LOCK_PUT(lock);
@@@ -2485,13 -2391,13 +2470,13 @@@ static int osc_statfs_async(struct obd_
                   req->rq_no_delay = 1;
           }
   
- -        req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
- -        CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
- -        aa = ptlrpc_req_async_args(req);
- -        aa->aa_oi = oinfo;
+ +      req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
+ +      CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ +      aa = ptlrpc_req_async_args(req);
+ +      aa->aa_oi = oinfo;
   
- -        ptlrpc_set_add_req(rqset, req);
- -        RETURN(0);
+ +      ptlrpc_set_add_req(rqset, req);
+ +      RETURN(0);
   }
   
   static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
@@@ -2595,10 -2501,9 +2580,9 @@@ out
         return err;
   }
   
- static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
-                             u32 keylen, void *key,
-                             u32 vallen, void *val,
-                             struct ptlrpc_request_set *set)
+ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      u32 keylen, void *key, u32 vallen, void *val,
+                      struct ptlrpc_request_set *set)
   {
           struct ptlrpc_request *req;
           struct obd_device     *obd = exp->exp_obd;
@@@ -2685,23 -2590,23 +2669,23 @@@
         tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
                                                         &RMF_OST_BODY :
                                                         &RMF_SETINFO_VAL);
- -        memcpy(tmp, val, vallen);
+ +      memcpy(tmp, val, vallen);
   
         if (KEY_IS(KEY_GRANT_SHRINK)) {
- -                struct osc_grant_args *aa;
- -                struct obdo *oa;
- -
- -                CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
- -                aa = ptlrpc_req_async_args(req);
- -                OBDO_ALLOC(oa);
- -                if (!oa) {
- -                        ptlrpc_req_finished(req);
- -                        RETURN(-ENOMEM);
- -                }
- -                *oa = ((struct ost_body *)val)->oa;
- -                aa->aa_oa = oa;
- -                req->rq_interpret_reply = osc_shrink_grant_interpret;
- -        }
+ +              struct osc_grant_args *aa;
+ +              struct obdo *oa;
+ +
+ +              CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ +              aa = ptlrpc_req_async_args(req);
+ +              OBDO_ALLOC(oa);
+ +              if (!oa) {
+ +                      ptlrpc_req_finished(req);
+ +                      RETURN(-ENOMEM);
+ +              }
+ +              *oa = ((struct ost_body *)val)->oa;
+ +              aa->aa_oa = oa;
+ +              req->rq_interpret_reply = osc_shrink_grant_interpret;
+ +      }
   
         ptlrpc_request_set_replen(req);
         if (!KEY_IS(KEY_GRANT_SHRINK)) {
@@@ -2714,17 -2619,16 +2698,16 @@@
   
         RETURN(0);
   }
+ EXPORT_SYMBOL(osc_set_info_async);
   
- static int osc_reconnect(const struct lu_env *env,
-                          struct obd_export *exp, struct obd_device *obd,
-                          struct obd_uuid *cluuid,
-                          struct obd_connect_data *data,
-                          void *localdata)
+ int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+                 struct obd_device *obd, struct obd_uuid *cluuid,
+                 struct obd_connect_data *data, void *localdata)
   {
-         struct client_obd *cli = &obd->u.cli;
+       struct client_obd *cli = &obd->u.cli;
   
-         if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
-                 long lost_grant;
+       if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+               long lost_grant;
                 long grant;
   
                 spin_lock(&cli->cl_loi_list_lock);
@@@ -2745,8 -2649,9 +2728,9 @@@
   
         RETURN(0);
   }
+ EXPORT_SYMBOL(osc_reconnect);
   
- static int osc_disconnect(struct obd_export *exp)
+ int osc_disconnect(struct obd_export *exp)
   {
         struct obd_device *obd = class_exp2obd(exp);
         int rc;
@@@ -2773,9 -2678,10 +2757,10 @@@
                   osc_del_shrink_grant(&obd->u.cli);
           return rc;
   }
+ EXPORT_SYMBOL(osc_disconnect);
   
- static int osc_ldlm_resource_invalidate(struct cfs_hash *hs,
-       struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg)
+ int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+                                struct hlist_node *hnode, void *arg)
   {
         struct lu_env *env = arg;
         struct ldlm_resource *res = cfs_hash_object(hs, hnode);
@@@ -2804,6 -2710,7 +2789,7 @@@
   
         RETURN(0);
   }
+ EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
   
   static int osc_import_event(struct obd_device *obd,
                               struct obd_import *imp,
@@@ -2911,15 -2818,12 +2897,12 @@@ static int brw_queue_work(const struct 
         RETURN(0);
   }
   
- int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
   {
         struct client_obd *cli = &obd->u.cli;
-       struct obd_type   *type;
-       void              *handler;
-       int                rc;
-       int                adding;
-       int                added;
-       int                req_count;
+       void *handler;
+       int rc;
+ 
         ENTRY;
   
         rc = ptlrpcd_addref();
@@@ -2930,9 -2834,10 +2913,10 @@@
         if (rc)
                 GOTO(out_ptlrpcd, rc);
   
+ 
         handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
         if (IS_ERR(handler))
-               GOTO(out_client_setup, rc = PTR_ERR(handler));
+               GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
         cli->cl_writeback_work = handler;
   
         handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
@@@ -2946,6 -2851,40 +2930,40 @@@
   
         cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
   
+       INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+       RETURN(rc);
+ 
+ out_ptlrpcd_work:
+       if (cli->cl_writeback_work != NULL) {
+               ptlrpcd_destroy_work(cli->cl_writeback_work);
+               cli->cl_writeback_work = NULL;
+       }
+       if (cli->cl_lru_work != NULL) {
+               ptlrpcd_destroy_work(cli->cl_lru_work);
+               cli->cl_lru_work = NULL;
+       }
+       client_obd_cleanup(obd);
+ out_ptlrpcd:
+       ptlrpcd_decref();
+       RETURN(rc);
+ }
+ EXPORT_SYMBOL(osc_setup_common);
+ 
+ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+ {
+       struct client_obd *cli = &obd->u.cli;
+       struct obd_type   *type;
+       int                adding;
+       int                added;
+       int                req_count;
+       int                rc;
+ 
+       ENTRY;
+ 
+       rc = osc_setup_common(obd, lcfg);
+       if (rc < 0)
+               RETURN(rc);
+ 
   #ifdef CONFIG_PROC_FS
         obd->obd_vars = lprocfs_osc_obd_vars;
   #endif
@@@ -3000,24 -2939,9 +3018,9 @@@
         spin_unlock(&osc_shrink_lock);
   
         RETURN(0);
- 
- out_ptlrpcd_work:
-       if (cli->cl_writeback_work != NULL) {
-               ptlrpcd_destroy_work(cli->cl_writeback_work);
-               cli->cl_writeback_work = NULL;
-       }
-       if (cli->cl_lru_work != NULL) {
-               ptlrpcd_destroy_work(cli->cl_lru_work);
-               cli->cl_lru_work = NULL;
-       }
- out_client_setup:
-       client_obd_cleanup(obd);
- out_ptlrpcd:
-       ptlrpcd_decref();
-       RETURN(rc);
   }
   
- static int osc_precleanup(struct obd_device *obd)
+ int osc_precleanup_common(struct obd_device *obd)
   {
         struct client_obd *cli = &obd->u.cli;
         ENTRY;
@@@ -3043,12 -2967,22 +3046,22 @@@
         }
   
         obd_cleanup_client_import(obd);
+       RETURN(0);
+ }
+ EXPORT_SYMBOL(osc_precleanup_common);
+ 
+ static int osc_precleanup(struct obd_device *obd)
+ {
+       ENTRY;
+ 
+       osc_precleanup_common(obd);
+ 
         ptlrpc_lprocfs_unregister_obd(obd);
         lprocfs_obd_cleanup(obd);
         RETURN(0);
   }
   
- int osc_cleanup(struct obd_device *obd)
+ int osc_cleanup_common(struct obd_device *obd)
   {
         struct client_obd *cli = &obd->u.cli;
         int rc;
@@@ -3078,6 -3012,7 +3091,7 @@@
         ptlrpcd_decref();
         RETURN(rc);
   }
+ EXPORT_SYMBOL(osc_cleanup_common);
   
   int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
   {
@@@ -3094,7 -3029,7 +3108,7 @@@ static struct obd_ops osc_obd_ops = 
           .o_owner                = THIS_MODULE,
           .o_setup                = osc_setup,
           .o_precleanup           = osc_precleanup,
-         .o_cleanup              = osc_cleanup,
+       .o_cleanup              = osc_cleanup_common,
           .o_add_conn             = client_import_add_conn,
           .o_del_conn             = client_import_del_conn,
           .o_connect              = client_connect_import,
diff --combined lustre/osd-zfs/osd_object.c

index e9b127d,e173964..8c495ef
--- 1/lustre/osd-zfs/osd_object.c
--- 2/lustre/osd-zfs/osd_object.c
+++ b/lustre/osd-zfs/osd_object.c
@@@ -214,25 -214,6 +214,25 @@@ int __osd_object_attr_get(const struct 
         if (rc)
                 GOTO(out_sa, rc);
   
+ +#ifdef ZFS_PROJINHERIT
+ +      if (o->od_projectused_dn && osa->flags & ZFS_PROJID) {
+ +              rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PROJID(o),
+ +                              &osa->projid, 8);
+ +              if (rc)
+ +                      GOTO(out_sa, rc);
+ +
+ +              la->la_projid = osa->projid;
+ +              la->la_valid |= LA_PROJID;
+ +              obj->oo_with_projid = 1;
+ +      } else {
+ +              la->la_projid = ZFS_DEFAULT_PROJID;
+ +              la->la_valid &= ~LA_PROJID;
+ +      }
+ +#else
+ +      la->la_projid = 0;
+ +      la->la_valid &= ~LA_PROJID;
+ +#endif
+ +
         la->la_atime = osa->atime[0];
         la->la_mtime = osa->mtime[0];
         la->la_ctime = osa->ctime[0];
@@@ -414,11 -395,6 +414,11 @@@ static dnode_t *osd_quota_fid2dmu(cons
         case ACCT_GROUP_OID:
                 dn = osd->od_groupused_dn;
                 break;
+ +#ifdef ZFS_PROJINHERIT
+ +      case ACCT_PROJECT_OID:
+ +              dn = osd->od_projectused_dn;
+ +              break;
+ +#endif
         default:
                 break;
         }
@@@ -572,15 -548,13 +572,15 @@@ static int osd_declare_destroy(const st
   
         /* one less inode */
         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
- -                             obj->oo_attr.la_gid, -1, oh, false, NULL, false);
+ +                             obj->oo_attr.la_gid, obj->oo_attr.la_projid,
+ +                             -1, oh, NULL, OSD_QID_INODE);
         if (rc)
                 RETURN(rc);
   
         /* data to be truncated */
         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
- -                             obj->oo_attr.la_gid, 0, oh, true, NULL, false);
+ +                             obj->oo_attr.la_gid, obj->oo_attr.la_projid,
+ +                             0, oh, NULL, OSD_QID_BLK);
         if (rc)
                 RETURN(rc);
   
@@@ -931,7 -905,7 +931,7 @@@ static int osd_declare_attr_set(const s
                  * anything else */
         }
   
- -      if (attr && (attr->la_valid & (LA_UID | LA_GID))) {
+ +      if (attr && (attr->la_valid & (LA_UID | LA_GID | LA_PROJID))) {
                 sa_object_size(obj->oo_sa_hdl, &blksize, &bspace);
                 bspace = toqb(bspace * blksize);
         }
@@@ -958,38 -932,7 +958,38 @@@
                                 GOTO(out, rc);
                 }
         }
- -
+ +#ifdef ZFS_PROJINHERIT
+ +      if (attr && attr->la_valid & LA_PROJID) {
+ +              if (!osd->od_projectused_dn)
+ +                      GOTO(out, rc = -EOPNOTSUPP);
+ +
+ +              /* Usually, if project quota is upgradable for the device,
+ +               * then the upgrade will be done before or when mounting the
+ +               * device. So when we come here, this object should have
+ +               * project ID attribute already (that is zero by default).
+ +               * Otherwise, there was something wrong during the former
+ +               * upgrade, let's return failure to report that.
+ +               *
+ +               * Please note that, different from other attributes, you
+ +               * can NOT simply set the project ID attribute under such
+ +               * case, because adding (NOT change) project ID attribute
+ +               * needs to change the object's attribute layout to match
+ +               * zfs backend quota accounting requirement. */
+ +              if (unlikely(!obj->oo_with_projid))
+ +                      GOTO(out, rc = -ENXIO);
+ +
+ +              /* quota enforcement for project */
+ +              if (attr->la_projid != obj->oo_attr.la_projid) {
+ +                      rc = qsd_transfer(env, osd->od_quota_slave,
+ +                                        &oh->ot_quota_trans, PRJQUOTA,
+ +                                        obj->oo_attr.la_projid,
+ +                                        attr->la_projid, bspace,
+ +                                        &info->oti_qi);
+ +                      if (rc)
+ +                              GOTO(out, rc);
+ +              }
+ +      }
+ +#endif
   out:
         up_read(&obj->oo_guard);
         RETURN(rc);
@@@ -1074,30 -1017,13 +1074,30 @@@ static int osd_attr_set(const struct lu
                         if (rc < 0) {
                                 CWARN("%s: failed to set LMA flags: rc = %d\n",
                                        osd->od_svname, rc);
- -                              RETURN(rc);
+ +                              GOTO(out, rc);
                         }
                 }
         }
   
         write_lock(&obj->oo_attr_lock);
         cnt = 0;
+ +
+ +      if (valid & LA_PROJID) {
+ +#ifdef ZFS_PROJINHERIT
+ +              /* osd_declare_attr_set() must be called first.
+ +               * If osd::od_projectused_dn is not set, then we
+ +               * cannot get here. */
+ +              LASSERT(osd->od_projectused_dn);
+ +              LASSERT(obj->oo_with_projid);
+ +
+ +              osa->projid = obj->oo_attr.la_projid = la->la_projid;
+ +              SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
+ +                               &osa->projid, 8);
+ +#else
+ +              valid &= ~LA_PROJID;
+ +#endif
+ +      }
+ +
         if (valid & LA_ATIME) {
                 osa->atime[0] = obj->oo_attr.la_atime = la->la_atime;
                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL,
@@@ -1141,10 -1067,6 +1141,10 @@@
                 /* many flags are not supported by zfs, so ensure a good cached
                  * copy */
                 obj->oo_attr.la_flags = attrs_zfs2fs(osa->flags);
+ +#ifdef ZFS_PROJINHERIT
+ +              if (obj->oo_with_projid)
+ +                      osa->flags |= ZFS_PROJID;
+ +#endif
                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL,
                                  &osa->flags, 8);
         }
@@@ -1258,14 -1180,14 +1258,14 @@@ static int osd_declare_create(const str
         /* will help to find FID->ino mapping at dt_insert() */
         osd_idc_find_and_init(env, osd, obj);
   
- -      rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid, 1, oh,
- -                             false, NULL, false);
+ +      rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid,
+ +                             attr->la_projid, 1, oh, NULL, OSD_QID_INODE);
   
         RETURN(rc);
   }
   
   int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
- -                  sa_handle_t *sa_hdl, dmu_tx_t *tx,
+ +                  struct osd_object *obj, sa_handle_t *sa_hdl, dmu_tx_t *tx,
                     struct lu_attr *la, uint64_t parent,
                     nvlist_t *xattr)
   {
@@@ -1294,32 -1216,16 +1294,32 @@@
         osa->gid = la->la_gid;
         osa->rdev = la->la_rdev;
         osa->nlink = la->la_nlink;
- -      osa->flags = attrs_fs2zfs(la->la_flags);
+ +      if (la->la_valid & LA_FLAGS)
+ +              osa->flags = attrs_fs2zfs(la->la_flags);
+ +      else
+ +              osa->flags = 0;
         osa->size  = la->la_size;
+ +#ifdef ZFS_PROJINHERIT
+ +      if (osd->od_projectused_dn) {
+ +              if (la->la_valid & LA_PROJID)
+ +                      osa->projid = la->la_projid;
+ +              else
+ +                      osa->projid = ZFS_DEFAULT_PROJID;
+ +              osa->flags |= ZFS_PROJID;
+ +              if (obj)
+ +                      obj->oo_with_projid = 1;
+ +      } else {
+ +              osa->flags &= ~ZFS_PROJID;
+ +      }
+ +#endif
   
         /*
          * we need to create all SA below upon object create.
          *
          * XXX The attribute order matters since the accounting callback relies
          * on static offsets (i.e. SA_*_OFFSET, see zfs_space_delta_cb()) to
- -       * look up the UID/GID attributes. Moreover, the callback does not seem
- -       * to support the spill block.
+ +       * look up the UID/GID/PROJID attributes. Moreover, the callback does
+ +       * not seem to support the spill block.
          * We define attributes in the same order as SA_*_OFFSET in order to
          * work around the problem. See ORI-610.
          */
@@@ -1336,11 -1242,6 +1336,11 @@@
         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16);
         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, crtime, 16);
         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8);
+ +#ifdef ZFS_PROJINHERIT
+ +      if (osd->od_projectused_dn)
+ +              SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
+ +                               &osa->projid, 8);
+ +#endif
         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8);
         LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
   
@@@ -1408,43 -1309,6 +1408,43 @@@ static int osd_find_new_dnode(const str
         return rc;
   }
   
+ +#ifdef HAVE_DMU_OBJECT_ALLOC_DNSIZE
+ +static int osd_find_dnsize(struct osd_object *obj)
+ +{
+ +      struct osd_device *osd = osd_obj2dev(obj);
+ +      int dnsize;
+ +
+ +      if (osd->od_dnsize == ZFS_DNSIZE_AUTO) {
+ +              dnsize = DNODE_MIN_SIZE;
+ +              do {
+ +                      if (DN_BONUS_SIZE(dnsize) >= obj->oo_ea_in_bonus + 32)
+ +                              break;
+ +                      dnsize <<= 1;
+ +              } while (dnsize < DNODE_MAX_SIZE);
+ +              if (dnsize > DNODE_MAX_SIZE)
+ +                      dnsize = DNODE_MAX_SIZE;
+ +      } else if (osd->od_dnsize == ZFS_DNSIZE_1K) {
+ +              dnsize = 1024;
+ +      } else if (osd->od_dnsize == ZFS_DNSIZE_2K) {
+ +              dnsize = 2048;
+ +      } else if (osd->od_dnsize == ZFS_DNSIZE_4K) {
+ +              dnsize = 4096;
+ +      } else if (osd->od_dnsize == ZFS_DNSIZE_8K) {
+ +              dnsize = 8192;
+ +      } else if (osd->od_dnsize == ZFS_DNSIZE_16K) {
+ +              dnsize = 16384;
+ +      } else {
+ +              dnsize = DNODE_MIN_SIZE;
+ +      }
+ +      return dnsize;
+ +}
+ +#else
+ +static int inline osd_find_dnsize(struct osd_object *obj)
+ +{
+ +      return DN_MAX_BONUSLEN;
+ +}
+ +#endif
+ +
   /*
    * The transaction passed to this routine must have
    * dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT) called and then assigned
@@@ -1465,8 -1329,7 +1465,8 @@@ int __osd_object_create(const struct lu
                 type = DMU_OTN_UINT8_METADATA;
   
         /* Create a new DMU object with the dnode size from osd_find_dnsize(). */
- -      oid = osd_dmu_object_alloc(osd->od_os, type, 0, 0, tx);
+ +      oid = osd_dmu_object_alloc(osd->od_os, type, 0,
+ +                                 osd_find_dnsize(obj), tx);
   
         LASSERT(la->la_valid & LA_MODE);
         la->la_size = 0;
@@@ -1487,7 -1350,7 +1487,7 @@@
    * a conversion from the different internal ZAP hash formats being used. */
   int __osd_zap_create(const struct lu_env *env, struct osd_device *osd,
                      dnode_t **dnp, dmu_tx_t *tx, struct lu_attr *la,
- -                   zap_flags_t flags)
+ +                   unsigned dnsize, zap_flags_t flags)
   {
         uint64_t oid;
   
@@@ -1500,7 -1363,7 +1500,7 @@@
                                    DMU_OT_DIRECTORY_CONTENTS,
                                    14, /* == ZFS fzap_default_blockshift */
                                    DN_MAX_INDBLKSHIFT, /* indirect blockshift */
- -                                 0, tx);
+ +                                 dnsize, tx);
   
         la->la_size = 2;
         la->la_nlink = 1;
@@@ -1520,7 -1383,7 +1520,7 @@@ static dnode_t *osd_mkidx(const struct 
          * binary keys */
         LASSERT(S_ISREG(la->la_mode));
         rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
- -                            ZAP_FLAG_UINT64_KEY);
+ +                            osd_find_dnsize(obj), ZAP_FLAG_UINT64_KEY);
         if (rc)
                 return ERR_PTR(rc);
         return dn;
@@@ -1533,8 -1396,7 +1533,8 @@@ static dnode_t *osd_mkdir(const struct 
         int rc;
   
         LASSERT(S_ISDIR(la->la_mode));
- -      rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la, 0);
+ +      rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
+ +                            osd_find_dnsize(obj), 0);
         if (rc)
                 return ERR_PTR(rc);
         return dn;
@@@ -1553,8 -1415,7 +1553,7 @@@ static dnode_t *osd_mkreg(const struct 
         if (rc)
                 return ERR_PTR(rc);
   
-       if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid)) &&
-           osd->od_is_ost) {
+       if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid))) {
                 /* The minimum block size must be at least page size otherwise
                  * it will break the assumption in tgt_thread_big_cache where
                  * the array size is PTLRPC_MAX_BRW_PAGES. It will also affect
@@@ -1680,14 -1541,6 +1679,14 @@@ static int osd_create(const struct lu_e
         obj->oo_attr = *attr;
         obj->oo_attr.la_valid |= LA_SIZE | LA_NLINK | LA_TYPE;
   
+ +#ifdef ZFS_PROJINHERIT
+ +      if (osd->od_projectused_dn) {
+ +              if (!(obj->oo_attr.la_valid & LA_PROJID))
+ +                      obj->oo_attr.la_projid = ZFS_DEFAULT_PROJID;
+ +              obj->oo_with_projid = 1;
+ +      }
+ +#endif
+ +
         dn = osd_create_type_f(dof->dof_type)(env, obj, &obj->oo_attr, oh);
         if (IS_ERR(dn)) {
                 rc = PTR_ERR(dn);
diff --combined lustre/ptlrpc/pack_generic.c

index cd523f5,80f9120..74262a5
--- 1/lustre/ptlrpc/pack_generic.c
--- 2/lustre/ptlrpc/pack_generic.c
+++ b/lustre/ptlrpc/pack_generic.c
@@@ -1886,8 -1886,8 +1886,8 @@@ void lustre_swab_mdt_body (struct mdt_b
         __swab32s(&b->mbo_uid_h);
         __swab32s(&b->mbo_gid_h);
         __swab32s(&b->mbo_projid);
-       CLASSERT(offsetof(typeof(*b), mbo_padding_6) != 0);
-       CLASSERT(offsetof(typeof(*b), mbo_padding_7) != 0);
+       __swab64s(&b->mbo_dom_size);
+       __swab64s(&b->mbo_dom_blocks);
         CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0);
         CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0);
         CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0);
@@@ -1903,39 -1903,38 +1903,39 @@@ void lustre_swab_mdt_ioepoch(struct mdt
   
   void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
   {
- -        int i;
- -        __swab32s(&mti->mti_lustre_ver);
- -        __swab32s(&mti->mti_stripe_index);
- -        __swab32s(&mti->mti_config_ver);
- -        __swab32s(&mti->mti_flags);
- -        __swab32s(&mti->mti_instance);
- -        __swab32s(&mti->mti_nid_count);
- -        CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
- -        for (i = 0; i < MTI_NIDS_MAX; i++)
- -                __swab64s(&mti->mti_nids[i]);
+ +      int i;
+ +
+ +      __swab32s(&mti->mti_lustre_ver);
+ +      __swab32s(&mti->mti_stripe_index);
+ +      __swab32s(&mti->mti_config_ver);
+ +      __swab32s(&mti->mti_flags);
+ +      __swab32s(&mti->mti_instance);
+ +      __swab32s(&mti->mti_nid_count);
+ +      CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+ +      for (i = 0; i < MTI_NIDS_MAX; i++)
+ +              __swab64s(&mti->mti_nids[i]);
   }
   
   void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
   {
         __u8 i;
   
- -        __swab64s(&entry->mne_version);
- -        __swab32s(&entry->mne_instance);
- -        __swab32s(&entry->mne_index);
- -        __swab32s(&entry->mne_length);
- -
- -        /* mne_nid_(count|type) must be one byte size because we're gonna
- -         * access it w/o swapping. */
- -        CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
- -        CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
- -
- -        /* remove this assertion if ipv6 is supported. */
- -        LASSERT(entry->mne_nid_type == 0);
- -        for (i = 0; i < entry->mne_nid_count; i++) {
- -                CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
- -                __swab64s(&entry->u.nids[i]);
- -        }
+ +      __swab64s(&entry->mne_version);
+ +      __swab32s(&entry->mne_instance);
+ +      __swab32s(&entry->mne_index);
+ +      __swab32s(&entry->mne_length);
+ +
+ +      /* mne_nid_(count|type) must each be one byte in size because they
+ +       * are accessed without byte-swapping. */
+ +      CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
+ +      CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
+ +
+ +      /* remove this assertion if ipv6 is supported. */
+ +      LASSERT(entry->mne_nid_type == 0);
+ +      for (i = 0; i < entry->mne_nid_count; i++) {
+ +              CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+ +              __swab64s(&entry->u.nids[i]);
+ +      }
   }
   EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
   
diff --combined lustre/target/tgt_grant.c

index 7887623,62da5c9..4fa0623
--- 1/lustre/target/tgt_grant.c
--- 2/lustre/target/tgt_grant.c
+++ b/lustre/target/tgt_grant.c
@@@ -138,6 -138,11 +138,6 @@@ static int tgt_check_export_grants(stru
         struct tg_export_data *ted = &exp->exp_target_data;
         int level = D_CACHE;
   
- -      if (exp->exp_obd->obd_self_export == exp)
- -              CDEBUG(D_CACHE, "%s: processing self export: %ld %ld "
- -                     "%ld\n", exp->exp_obd->obd_name, ted->ted_grant,
- -                     ted->ted_pending, ted->ted_dirty);
- -
         if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0)
                 level = D_ERROR;
         CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
@@@ -183,7 -188,6 +183,7 @@@ void tgt_grant_sanity_check(struct obd_
         struct lu_target *lut = obd->u.obt.obt_lut;
         struct tg_grants_data *tgd = &lut->lut_tgd;
         struct obd_export *exp;
+ +      struct tg_export_data *ted;
         u64                maxsize;
         u64                tot_dirty = 0;
         u64                tot_pending = 0;
@@@ -205,15 -209,6 +205,15 @@@
   
         spin_lock(&obd->obd_dev_lock);
         spin_lock(&tgd->tgd_grant_lock);
+ +      exp = obd->obd_self_export;
+ +      ted = &exp->exp_target_data;
+ +      CDEBUG(D_CACHE, "%s: processing self export: %ld %ld "
+ +             "%ld\n", obd->obd_name, ted->ted_grant,
+ +             ted->ted_pending, ted->ted_dirty);
+ +      tot_granted += ted->ted_grant + ted->ted_pending;
+ +      tot_pending += ted->ted_pending;
+ +      tot_dirty += ted->ted_dirty;
+ +
         list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
                 error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending,
                                                 &tot_granted, maxsize);
@@@ -313,6 -308,8 +313,8 @@@ int tgt_statfs_internal(const struct lu
                 if (unlikely(rc))
                         GOTO(out, rc);
   
+               osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX);
+ 
                 spin_lock(&tgd->tgd_grant_lock);
                 spin_lock(&tgd->tgd_osfs_lock);
                 /* calculate how much space was written while we released the
@@@ -433,6 -430,7 +435,7 @@@ static u64 tgt_grant_space_left(struct 
         u64                      left;
         u64                      avail;
         u64                      unstable;
+       u64                      reserved;
   
         ENTRY;
         assert_spin_locked(&tgd->tgd_grant_lock);
@@@ -443,7 -441,8 +446,8 @@@
         unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */
         spin_unlock(&tgd->tgd_osfs_lock);
   
-       tot_granted = tgd->tgd_tot_granted;
+       reserved = left * tgd->tgd_reserved_pcnt / 100;
+       tot_granted = tgd->tgd_tot_granted + reserved;
   
         if (left < tot_granted) {
                 int mask = (left + unstable <
@@@ -1505,3 -1504,130 +1509,130 @@@ int tgt_grant_commit_cb_add(struct than
         RETURN(rc);
   }
   EXPORT_SYMBOL(tgt_grant_commit_cb_add);
+ 
+ /**
+  * Show estimate of total amount of dirty data on clients.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_tot_dirty_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd;
+ 
+       LASSERT(obd != NULL);
+       tgd = &obd->u.obt.obt_lut->lut_tgd;
+       seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_dirty_seq_show);
+ 
+ /**
+  * Show total amount of space granted to clients.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_tot_granted_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd;
+ 
+       LASSERT(obd != NULL);
+       tgd = &obd->u.obt.obt_lut->lut_tgd;
+       seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_granted_seq_show);
+ 
+ /**
+  * Show total amount of space used by IO in progress.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_tot_pending_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd;
+ 
+       LASSERT(obd != NULL);
+       tgd = &obd->u.obt.obt_lut->lut_tgd;
+       seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_pending_seq_show);
+ 
+ /**
+  * Show if grants compatibility mode is disabled.
+  *
+  * When tgd_grant_compat_disable is set, we don't grant any space to clients
+  * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
+  * a client is inflated since it consumes PAGE_SIZE of grant space per
+  * block (i.e. typically 4kB units), but the underlying file system might
+  * have a block size bigger than page size, e.g. ZFS. See LU-2049 for details.
+  *
+  * \param[in] m               seq_file handle
+  * \param[in] data    unused for single entry
+  *
+  * \retval            0 on success
+  * \retval            negative value on error
+  */
+ int tgt_grant_compat_disable_seq_show(struct seq_file *m, void *data)
+ {
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+ 
+       seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
+       return 0;
+ }
+ EXPORT_SYMBOL(tgt_grant_compat_disable_seq_show);
+ 
+ /**
+  * Change grant compatibility mode.
+  *
+  * Setting tgd_grant_compat_disable prohibits any space granting to clients
+  * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
+  *
+  * \param[in] file    proc file
+  * \param[in] buffer  string which represents mode
+  *                    1: disable compatibility mode
+  *                    0: enable compatibility mode
+  * \param[in] count   \a buffer length
+  * \param[in] off     unused for single entry
+  *
+  * \retval            \a count on success
+  * \retval            negative number on error
+  */
+ ssize_t tgt_grant_compat_disable_seq_write(struct file *file,
+                                          const char __user *buffer,
+                                          size_t count, loff_t *off)
+ {
+       struct seq_file *m = file->private_data;
+       struct obd_device *obd = m->private;
+       struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+       __s64 val;
+       int rc;
+ 
+       rc = lprocfs_str_to_s64(buffer, count, &val);
+       if (rc)
+               return rc;
+ 
+       if (val < 0)
+               return -EINVAL;
+ 
+       tgd->tgd_grant_compat_disable = !!val;
+ 
+       return count;
+ }
+ EXPORT_SYMBOL(tgt_grant_compat_disable_seq_write);
diff --combined lustre/target/tgt_handler.c

index e359462,9026f21..b81882c
--- 1/lustre/target/tgt_handler.c
--- 2/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@@ -434,19 -434,6 +434,19 @@@ static int tgt_handle_request0(struct t
                                              &RMF_ACL, RCL_SERVER,
                                              LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
   
+ +              if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO,
+ +                                        RCL_SERVER)) {
+ +                      struct niobuf_remote *remote_nb =
+ +                              req_capsule_client_get(tsi->tsi_pill,
+ +                                                     &RMF_NIOBUF_REMOTE);
+ +                      struct ost_body *body = tsi->tsi_ost_body;
+ +
+ +                      req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO,
+ +                                       RCL_SERVER,
+ +                                       (body->oa.o_flags & OBD_FL_SHORT_IO) ?
+ +                                       remote_nb[0].rnb_len : 0);
+ +              }
+ +
                 rc = req_capsule_server_pack(tsi->tsi_pill);
         }
   
@@@ -1583,6 -1570,35 +1583,35 @@@ void tgt_io_thread_done(struct ptlrpc_t
         EXIT;
   }
   EXPORT_SYMBOL(tgt_io_thread_done);
+ 
+ /**
+  * Helper function for getting the server-side DLM lock on a
+  * Data-on-MDT file if asked by the client.
+  */
+ int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id,
+                     struct lustre_handle *lh, int mode, __u64 *flags)
+ {
+       union ldlm_policy_data policy;
+       int rc;
+ 
+       ENTRY;
+ 
+       LASSERT(lh != NULL);
+       LASSERT(ns != NULL);
+       LASSERT(!lustre_handle_is_used(lh));
+ 
+       policy.l_inodebits.bits = MDS_INODELOCK_DOM | MDS_INODELOCK_UPDATE;
+       policy.l_inodebits.try_bits = 0;
+ 
+       rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, &policy, mode,
+                                   flags, ldlm_blocking_ast,
+                                   ldlm_completion_ast, ldlm_glimpse_ast,
+                                   NULL, 0, LVB_T_NONE, NULL, lh);
+ 
+       RETURN(rc == ELDLM_OK ? 0 : -EIO);
+ }
+ EXPORT_SYMBOL(tgt_mdt_data_lock);
+ 
   /**
    * Helper function for getting server side [start, start+count] DLM lock
    * if asked by client.
@@@ -1627,13 -1643,15 +1656,15 @@@ void tgt_extent_unlock(struct lustre_ha
   }
   EXPORT_SYMBOL(tgt_extent_unlock);
   
- int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id,
-                struct obd_ioobj *obj, struct niobuf_remote *nb,
-                struct lustre_handle *lh, enum ldlm_mode mode)
+ static int tgt_brw_lock(struct obd_export *exp, struct ldlm_res_id *res_id,
+                       struct obd_ioobj *obj, struct niobuf_remote *nb,
+                       struct lustre_handle *lh, enum ldlm_mode mode)
   {
+       struct ldlm_namespace   *ns = exp->exp_obd->obd_namespace;
         __u64                    flags = 0;
         int                      nrbufs = obj->ioo_bufcnt;
         int                      i;
+       int                      rc;
   
         ENTRY;
   
@@@ -1650,14 -1668,19 +1681,19 @@@
                 if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK))
                         RETURN(-EFAULT);
   
-       RETURN(tgt_extent_lock(ns, res_id, nb[0].rnb_offset,
-                              nb[nrbufs - 1].rnb_offset +
-                              nb[nrbufs - 1].rnb_len - 1,
-                              lh, mode, &flags));
+       /* MDT IO for data-on-mdt */
+       if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS)
+               rc = tgt_mdt_data_lock(ns, res_id, lh, mode, &flags);
+       else
+               rc = tgt_extent_lock(ns, res_id, nb[0].rnb_offset,
+                                    nb[nrbufs - 1].rnb_offset +
+                                    nb[nrbufs - 1].rnb_len - 1,
+                                    lh, mode, &flags);
+       RETURN(rc);
   }
   
- void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob,
-                   struct lustre_handle *lh, enum ldlm_mode mode)
+ static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob,
+                          struct lustre_handle *lh, enum ldlm_mode mode)
   {
         ENTRY;
   
@@@ -1670,9 -1693,10 +1706,9 @@@
                 tgt_extent_unlock(lh, mode);
         EXIT;
   }
- -
- -static __u32 tgt_checksum_bulk(struct lu_target *tgt,
- -                             struct ptlrpc_bulk_desc *desc, int opc,
- -                             enum cksum_types cksum_type)
+ +static __u32 tgt_checksum_niobuf(struct lu_target *tgt,
+ +                               struct niobuf_local *local_nb, int npages,
+ +                               int opc, enum cksum_types cksum_type)
   {
         struct cfs_crypto_hash_desc     *hdesc;
         unsigned int                    bufsize;
@@@ -1680,6 -1704,8 +1716,6 @@@
         unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
         __u32                           cksum;
   
- -      LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
- -
         hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
         if (IS_ERR(hdesc)) {
                 CERROR("%s: unable to initialize checksum hash %s\n",
@@@ -1688,64 -1714,65 +1724,64 @@@
         }
   
         CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg));
- -      for (i = 0; i < desc->bd_iov_count; i++) {
+ +      for (i = 0; i < npages; i++) {
                 /* corrupt the data before we compute the checksum, to
                  * simulate a client->OST data error */
                 if (i == 0 && opc == OST_WRITE &&
                     OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) {
- -                      int off = BD_GET_KIOV(desc, i).kiov_offset &
- -                              ~PAGE_MASK;
- -                      int len = BD_GET_KIOV(desc, i).kiov_len;
+ +                      int off = local_nb[i].lnb_page_offset & ~PAGE_MASK;
+ +                      int len = local_nb[i].lnb_len;
                         struct page *np = tgt_page_to_corrupt;
- -                      char *ptr = kmap(BD_GET_KIOV(desc, i).kiov_page) + off;
   
                         if (np) {
- -                              char *ptr2 = kmap(np) + off;
+ +                              char *ptr = ll_kmap_atomic(local_nb[i].lnb_page,
+ +                                                      KM_USER0);
+ +                              char *ptr2 = page_address(np);
   
- -                              memcpy(ptr2, ptr, len);
- -                              memcpy(ptr2, "bad3", min(4, len));
- -                              kunmap(np);
+ +                              memcpy(ptr2 + off, ptr + off, len);
+ +                              memcpy(ptr2 + off, "bad3", min(4, len));
+ +                              ll_kunmap_atomic(ptr, KM_USER0);
   
                                 /* LU-8376 to preserve original index for
                                  * display in dump_all_bulk_pages() */
- -                              np->index = BD_GET_KIOV(desc,
- -                                                      i).kiov_page->index;
+ +                              np->index = i;
   
- -                              BD_GET_KIOV(desc, i).kiov_page = np;
+ +                              cfs_crypto_hash_update_page(hdesc, np, off,
+ +                                                          len);
+ +                              continue;
                         } else {
                                 CERROR("%s: can't alloc page for corruption\n",
                                        tgt_name(tgt));
                         }
                 }
- -              cfs_crypto_hash_update_page(hdesc,
- -                                BD_GET_KIOV(desc, i).kiov_page,
- -                                BD_GET_KIOV(desc, i).kiov_offset &
- -                                      ~PAGE_MASK,
- -                                BD_GET_KIOV(desc, i).kiov_len);
+ +              cfs_crypto_hash_update_page(hdesc, local_nb[i].lnb_page,
+ +                                local_nb[i].lnb_page_offset & ~PAGE_MASK,
+ +                                local_nb[i].lnb_len);
   
                  /* corrupt the data after we compute the checksum, to
                  * simulate an OST->client data error */
                 if (i == 0 && opc == OST_READ &&
                     OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) {
- -                      int off = BD_GET_KIOV(desc, i).kiov_offset
- -                        & ~PAGE_MASK;
- -                      int len = BD_GET_KIOV(desc, i).kiov_len;
+ +                      int off = local_nb[i].lnb_page_offset & ~PAGE_MASK;
+ +                      int len = local_nb[i].lnb_len;
                         struct page *np = tgt_page_to_corrupt;
- -                      char *ptr =
- -                        kmap(BD_GET_KIOV(desc, i).kiov_page) + off;
   
                         if (np) {
- -                              char *ptr2 = kmap(np) + off;
+ +                              char *ptr = ll_kmap_atomic(local_nb[i].lnb_page,
+ +                                                      KM_USER0);
+ +                              char *ptr2 = page_address(np);
   
- -                              memcpy(ptr2, ptr, len);
- -                              memcpy(ptr2, "bad4", min(4, len));
- -                              kunmap(np);
+ +                              memcpy(ptr2 + off, ptr + off, len);
+ +                              memcpy(ptr2 + off, "bad4", min(4, len));
+ +                              ll_kunmap_atomic(ptr, KM_USER0);
   
                                 /* LU-8376 to preserve original index for
                                  * display in dump_all_bulk_pages() */
- -                              np->index = BD_GET_KIOV(desc,
- -                                                      i).kiov_page->index;
+ +                              np->index = i;
   
- -                              BD_GET_KIOV(desc, i).kiov_page = np;
+ +                              cfs_crypto_hash_update_page(hdesc, np, off,
+ +                                                          len);
+ +                              continue;
                         } else {
                                 CERROR("%s: can't alloc page for corruption\n",
                                        tgt_name(tgt));
@@@ -1762,8 -1789,8 +1798,8 @@@
   char dbgcksum_file_name[PATH_MAX];
   
   static void dump_all_bulk_pages(struct obdo *oa, int count,
- -                                  lnet_kiov_t *iov, __u32 server_cksum,
- -                                  __u32 client_cksum)
+ +                              struct niobuf_local *local_nb,
+ +                              __u32 server_cksum, __u32 client_cksum)
   {
         struct file *filp;
         int rc, i;
@@@ -1781,9 -1808,9 +1817,9 @@@
                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
                  oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
- -               (__u64)iov[0].kiov_page->index << PAGE_SHIFT,
- -               ((__u64)iov[count - 1].kiov_page->index << PAGE_SHIFT) +
- -               iov[count - 1].kiov_len - 1, client_cksum, server_cksum);
+ +               local_nb[0].lnb_file_offset,
+ +               local_nb[count-1].lnb_file_offset +
+ +               local_nb[count-1].lnb_len - 1, client_cksum, server_cksum);
         filp = filp_open(dbgcksum_file_name,
                          O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
         if (IS_ERR(filp)) {
@@@ -1801,8 -1828,8 +1837,8 @@@
         oldfs = get_fs();
         set_fs(KERNEL_DS);
         for (i = 0; i < count; i++) {
- -              len = iov[i].kiov_len;
- -              buf = kmap(iov[i].kiov_page);
+ +              len = local_nb[i].lnb_len;
+ +              buf = kmap(local_nb[i].lnb_page);
                 while (len != 0) {
                         rc = vfs_write(filp, (__force const char __user *)buf,
                                        len, &filp->f_pos);
@@@ -1816,7 -1843,7 +1852,7 @@@
                         CDEBUG(D_INFO, "%s: wrote %d bytes\n",
                                dbgcksum_file_name, rc);
                 }
- -              kunmap(iov[i].kiov_page);
+ +              kunmap(local_nb[i].lnb_page);
         }
         set_fs(oldfs);
   
@@@ -1827,15 -1854,13 +1863,15 @@@
         return;
   }
   
- -static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa,
+ +static int check_read_checksum(struct niobuf_local *local_nb, int npages,
+ +                             struct obd_export *exp, struct obdo *oa,
                                const lnet_process_id_t *peer,
                                __u32 client_cksum, __u32 server_cksum,
                                enum cksum_types server_cksum_type)
   {
         char *msg;
         enum cksum_types cksum_type;
+ +      loff_t start, end;
   
         /* unlikely to happen and only if resend does not occur due to cksum
          * control failure on Client */
@@@ -1845,8 -1870,9 +1881,8 @@@
                 return 0;
         }
   
- -      if (desc->bd_export->exp_obd->obd_checksum_dump)
- -              dump_all_bulk_pages(oa, desc->bd_iov_count,
- -                                  &BD_GET_KIOV(desc, 0), server_cksum,
+ +      if (exp->exp_obd->obd_checksum_dump)
+ +              dump_all_bulk_pages(oa, npages, local_nb, server_cksum,
                                     client_cksum);
   
         cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
@@@ -1858,49 -1884,24 +1894,49 @@@
         else
                 msg = "should have changed on the client or in transit";
   
+ +      start = local_nb[0].lnb_file_offset;
+ +      end = local_nb[npages-1].lnb_file_offset +
+ +                                      local_nb[npages-1].lnb_len - 1;
+ +
         LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode "
                 DFID " object "DOSTID" extent [%llu-%llu], client returned csum"
                 " %x (type %x), server csum %x (type %x)\n",
- -              desc->bd_export->exp_obd->obd_name,
+ +              exp->exp_obd->obd_name,
                 msg, libcfs_nid2str(peer->nid),
                 oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
                 oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
                 oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
                 POSTID(&oa->o_oi),
- -              (__u64)BD_GET_KIOV(desc, 0).kiov_page->index << PAGE_SHIFT,
- -              ((__u64)BD_GET_KIOV(desc,
- -                                  desc->bd_iov_count - 1).kiov_page->index
- -                      << PAGE_SHIFT) +
- -                      BD_GET_KIOV(desc, desc->bd_iov_count - 1).kiov_len - 1,
- -              client_cksum, cksum_type, server_cksum, server_cksum_type);
+ +              start, end, client_cksum, cksum_type, server_cksum,
+ +              server_cksum_type);
+ +
         return 1;
   }
   
+ +/**
+ + * Pack the data held in local niobuf pages into a short I/O buffer.
+ + *
+ + * For every page, lnb_len bytes are read starting at the in-page offset
+ + * (lnb_page_offset & ~PAGE_MASK) and stored back to back in \a buf.
+ + *
+ + * \param[in]  local   array of local niobufs describing the pages
+ + * \param[in]  npages  number of entries of \a local to copy
+ + * \param[out] buf     destination buffer, filled contiguously
+ + * \param[in]  size    capacity of \a buf in bytes
+ + *
+ + * \retval number of bytes stored in \a buf on success
+ + * \retval -EINVAL if a page holds more data than is left in \a buf
+ + */
+ +static int tgt_pages2shortio(struct niobuf_local *local, int npages,
+ +                           unsigned char *buf, int size)
+ +{
+ +      int     i, off, len, copied = size;
+ +      char    *ptr;
+ +
+ +      for (i = 0; i < npages; i++) {
+ +              off = local[i].lnb_page_offset & ~PAGE_MASK;
+ +              len = local[i].lnb_len;
+ +
+ +              CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n",
+ +                     i, off, len, size);
+ +              if (len > size)
+ +                      return -EINVAL;
+ +
+ +              ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0);
+ +              /* The offset locates the data inside the mapped page; the
+ +               * destination is packed, so it must not be shifted by it
+ +               * (that would also write past the space checked above). */
+ +              memcpy(buf, ptr + off, len);
+ +              ll_kunmap_atomic(ptr, KM_USER0);
+ +              buf += len;
+ +              size -= len;
+ +      }
+ +      return copied - size;
+ +}
+ +
   int tgt_brw_read(struct tgt_session_info *tsi)
   {
         struct ptlrpc_request   *req = tgt_ses_req(tsi);
@@@ -1912,13 -1913,13 +1948,14 @@@
         struct ost_body         *body, *repbody;
         struct l_wait_info       lwi;
         struct lustre_handle     lockh = { 0 };
- -      int                      npages, nob = 0, rc, i, no_reply = 0;
+ +      int                      npages, nob = 0, rc, i, no_reply = 0,
+ +                               npages_read;
         struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data;
   
         ENTRY;
   
-       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL &&
+           ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) {
                 CERROR("%s: deny read request from %s to portal %u\n",
                        tgt_name(tsi->tsi_tgt),
                        obd_export_nid2str(req->rq_export),
@@@ -1961,8 -1962,8 +1998,8 @@@
   
         local_nb = tbc->local;
   
-       rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo,
-                         remote_nb, &lockh, LCK_PR);
+       rc = tgt_brw_lock(exp, &tsi->tsi_resid, ioo, remote_nb, &lockh,
+                         LCK_PR);
         if (rc != 0)
                 RETURN(rc);
   
@@@ -1989,41 -1990,33 +2026,41 @@@
         if (rc != 0)
                 GOTO(out_lock, rc);
   
- -      desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
- -                                  PTLRPC_BULK_PUT_SOURCE |
- -                                      PTLRPC_BULK_BUF_KIOV,
- -                                  OST_BULK_PORTAL,
- -                                  &ptlrpc_bulk_kiov_nopin_ops);
- -      if (desc == NULL)
- -              GOTO(out_commitrw, rc = -ENOMEM);
+ +      if (body->oa.o_flags & OBD_FL_SHORT_IO) {
+ +              desc = NULL;
+ +      } else {
+ +              desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
+ +                                          PTLRPC_BULK_PUT_SOURCE |
+ +                                              PTLRPC_BULK_BUF_KIOV,
+ +                                          OST_BULK_PORTAL,
+ +                                          &ptlrpc_bulk_kiov_nopin_ops);
+ +              if (desc == NULL)
+ +                      GOTO(out_commitrw, rc = -ENOMEM);
+ +      }
   
         nob = 0;
+ +      npages_read = npages;
         for (i = 0; i < npages; i++) {
                 int page_rc = local_nb[i].lnb_rc;
   
                 if (page_rc < 0) {
                         rc = page_rc;
+ +                      npages_read = i;
                         break;
                 }
   
                 nob += page_rc;
- -              if (page_rc != 0) { /* some data! */
+ +              if (page_rc != 0 && desc != NULL) { /* some data! */
                         LASSERT(local_nb[i].lnb_page != NULL);
                         desc->bd_frag_ops->add_kiov_frag
                           (desc, local_nb[i].lnb_page,
- -                         local_nb[i].lnb_page_offset,
+ +                         local_nb[i].lnb_page_offset & ~PAGE_MASK,
                            page_rc);
                 }
   
                 if (page_rc != local_nb[i].lnb_len) { /* short read */
+ +                      local_nb[i].lnb_len = page_rc;
+ +                      npages_read = i + (page_rc != 0 ? 1 : 0);
                         /* All subsequent pages should be 0 */
                         while (++i < npages)
                                 LASSERT(local_nb[i].lnb_rc == 0);
@@@ -2041,9 -2034,8 +2078,9 @@@
   
                 repbody->oa.o_flags = cksum_type_pack(cksum_type);
                 repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
- -              repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc,
- -                                                      OST_READ, cksum_type);
+ +              repbody->oa.o_cksum = tgt_checksum_niobuf(tsi->tsi_tgt,
+ +                                                       local_nb, npages_read,
+ +                                                       OST_READ, cksum_type);
                 CDEBUG(D_PAGE, "checksum at read origin: %x\n",
                        repbody->oa.o_cksum);
   
@@@ -2052,8 -2044,7 +2089,8 @@@
                  * zero-cksum case) */
                 if ((body->oa.o_valid & OBD_MD_FLFLAGS) &&
                     (body->oa.o_flags & OBD_FL_RECOV_RESEND))
- -                      check_read_checksum(desc, &body->oa, &req->rq_peer,
+ +                      check_read_checksum(local_nb, npages_read, exp,
+ +                                          &body->oa, &req->rq_peer,
                                             body->oa.o_cksum,
                                             repbody->oa.o_cksum, cksum_type);
         } else {
@@@ -2063,31 -2054,11 +2100,31 @@@
   
         /* Check if client was evicted while we were doing i/o before touching
          * network */
- -      if (likely(rc == 0 &&
- -                 !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) &&
- -                 !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) {
- -              rc = target_bulk_io(exp, desc, &lwi);
+ +      if (rc == 0) {
+ +              if (body->oa.o_flags & OBD_FL_SHORT_IO) {
+ +                      unsigned char *short_io_buf;
+ +                      int short_io_size;
+ +
+ +                      short_io_buf = req_capsule_server_get(&req->rq_pill,
+ +                                                            &RMF_SHORT_IO);
+ +                      short_io_size = req_capsule_get_size(&req->rq_pill,
+ +                                                           &RMF_SHORT_IO,
+ +                                                           RCL_SERVER);
+ +                      rc = tgt_pages2shortio(local_nb, npages_read,
+ +                                             short_io_buf, short_io_size);
+ +                      if (rc >= 0)
+ +                              req_capsule_shrink(&req->rq_pill,
+ +                                                 &RMF_SHORT_IO, rc,
+ +                                                 RCL_SERVER);
+ +                      rc = rc > 0 ? 0 : rc;
+ +              } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) {
+ +                      rc = target_bulk_io(exp, desc, &lwi);
+ +              }
                 no_reply = rc != 0;
+ +      } else {
+ +              if (body->oa.o_flags & OBD_FL_SHORT_IO)
+ +                      req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0,
+ +                                         RCL_SERVER);
         }
   
   out_commitrw:
@@@ -2115,10 -2086,8 +2152,10 @@@ out_lock
                               obd_export_nid2str(exp), rc);
         }
         /* send a bulk after reply to simulate a network delay or reordering
- -       * by a router */
- -      if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) {
+ +       * by a router - Note that !desc implies short io, so there is no bulk
+ +       * to reorder. */
+ +      if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) &&
+ +          desc) {
                 wait_queue_head_t        waitq;
                 struct l_wait_info       lwi1;
   
@@@ -2135,32 -2104,6 +2172,32 @@@
   }
   EXPORT_SYMBOL(tgt_brw_read);
   
+ +/**
+ + * Unpack a short I/O buffer into local niobuf pages.
+ + *
+ + * Data is consumed from \a buf contiguously; for every page with a non-zero
+ + * lnb_len it is stored at the in-page offset (lnb_page_offset & ~PAGE_MASK).
+ + * At most \a size bytes are ever read from \a buf: if the pages describe
+ + * more data than the buffer holds, the last copy is truncated and the
+ + * remaining pages are left untouched.
+ + *
+ + * \param[in] local   array of local niobufs describing the pages
+ + * \param[in] npages  number of entries in \a local
+ + * \param[in] buf     source buffer
+ + * \param[in] size    number of valid bytes in \a buf
+ + *
+ + * \retval 0 on success
+ + * \retval -EINVAL if a page could not be mapped
+ + */
+ +static int tgt_shortio2pages(struct niobuf_local *local, int npages,
+ +                           unsigned char *buf, int size)
+ +{
+ +      int     i, off, len, chunk;
+ +      char    *ptr;
+ +
+ +      for (i = 0; i < npages; i++) {
+ +              off = local[i].lnb_page_offset & ~PAGE_MASK;
+ +              len = local[i].lnb_len;
+ +
+ +              if (len == 0)
+ +                      continue;
+ +
+ +              /* Nothing left in the buffer. Never let the remaining size
+ +               * go negative: it would reach memcpy() as a huge unsigned
+ +               * length. */
+ +              if (size <= 0)
+ +                      break;
+ +
+ +              CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n",
+ +                     i, off, len, size);
+ +              ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0);
+ +              if (ptr == NULL)
+ +                      return -EINVAL;
+ +              chunk = len < size ? len : size;
+ +              memcpy(ptr + off, buf, chunk);
+ +              ll_kunmap_atomic(ptr, KM_USER0);
+ +              /* advance by what was really consumed */
+ +              buf += chunk;
+ +              size -= chunk;
+ +      }
+ +      return 0;
+ +}
+ +
   static void tgt_warn_on_cksum(struct ptlrpc_request *req,
                               struct ptlrpc_bulk_desc *desc,
                               struct niobuf_local *local_nb, int npages,
@@@ -2175,13 -2118,14 +2212,13 @@@
         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
         LASSERT(body != NULL);
   
- -      if (req->rq_peer.nid != desc->bd_sender) {
+ +      if (desc && req->rq_peer.nid != desc->bd_sender) {
                 via = " via ";
                 router = libcfs_nid2str(desc->bd_sender);
         }
   
         if (exp->exp_obd->obd_checksum_dump)
- -              dump_all_bulk_pages(&body->oa, desc->bd_iov_count,
- -                                  &BD_GET_KIOV(desc, 0), server_cksum,
+ +              dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum,
                                     client_cksum);
   
         if (mmap) {
@@@ -2229,7 -2173,8 +2266,8 @@@ int tgt_brw_write(struct tgt_session_in
   
         ENTRY;
   
-       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+       if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL &&
+           ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) {
                 CERROR("%s: deny write request from %s to portal %u\n",
                        tgt_name(tsi->tsi_tgt),
                        obd_export_nid2str(req->rq_export),
@@@ -2293,8 -2238,8 +2331,8 @@@
   
         local_nb = tbc->local;
   
-       rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo,
-                         remote_nb, &lockh, LCK_PW);
+       rc = tgt_brw_lock(exp, &tsi->tsi_resid, ioo, remote_nb, &lockh,
+                         LCK_PW);
         if (rc != 0)
                 GOTO(out, rc);
   
@@@ -2331,45 -2276,26 +2369,45 @@@
                         objcount, ioo, remote_nb, &npages, local_nb);
         if (rc < 0)
                 GOTO(out_lock, rc);
+ +      if (body->oa.o_flags & OBD_FL_SHORT_IO) {
+ +              int short_io_size;
+ +              unsigned char *short_io_buf;
+ +
+ +              short_io_size = req_capsule_get_size(&req->rq_pill,
+ +                                                   &RMF_SHORT_IO,
+ +                                                   RCL_CLIENT);
+ +              short_io_buf = req_capsule_client_get(&req->rq_pill,
+ +                                                    &RMF_SHORT_IO);
+ +              CDEBUG(D_INFO, "Client use short io for data transfer,"
+ +                             " size = %d\n", short_io_size);
+ +
+ +              /* Copy short io buf to pages */
+ +              rc = tgt_shortio2pages(local_nb, npages, short_io_buf,
+ +                                     short_io_size);
+ +              desc = NULL;
+ +      } else {
+ +              desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
+ +                                          PTLRPC_BULK_GET_SINK |
+ +                                          PTLRPC_BULK_BUF_KIOV,
+ +                                          OST_BULK_PORTAL,
+ +                                          &ptlrpc_bulk_kiov_nopin_ops);
+ +              if (desc == NULL)
+ +                      GOTO(skip_transfer, rc = -ENOMEM);
+ +
+ +              /* NB Having prepped, we must commit... */
+ +              for (i = 0; i < npages; i++)
+ +                      desc->bd_frag_ops->add_kiov_frag(desc,
+ +                                      local_nb[i].lnb_page,
+ +                                      local_nb[i].lnb_page_offset & ~PAGE_MASK,
+ +                                      local_nb[i].lnb_len);
+ +
+ +              rc = sptlrpc_svc_prep_bulk(req, desc);
+ +              if (rc != 0)
+ +                      GOTO(skip_transfer, rc);
   
- -      desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
- -                                  PTLRPC_BULK_GET_SINK | PTLRPC_BULK_BUF_KIOV,
- -                                  OST_BULK_PORTAL,
- -                                  &ptlrpc_bulk_kiov_nopin_ops);
- -      if (desc == NULL)
- -              GOTO(skip_transfer, rc = -ENOMEM);
- -
- -      /* NB Having prepped, we must commit... */
- -      for (i = 0; i < npages; i++)
- -              desc->bd_frag_ops->add_kiov_frag(desc,
- -                                               local_nb[i].lnb_page,
- -                                               local_nb[i].lnb_page_offset,
- -                                               local_nb[i].lnb_len);
- -
- -      rc = sptlrpc_svc_prep_bulk(req, desc);
- -      if (rc != 0)
- -              GOTO(skip_transfer, rc);
+ +              rc = target_bulk_io(exp, desc, &lwi);
+ +      }
   
- -      rc = target_bulk_io(exp, desc, &lwi);
         no_reply = rc != 0;
   
   skip_transfer:
@@@ -2382,10 -2308,8 +2420,10 @@@
                 repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
                 repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL;
                 repbody->oa.o_flags |= cksum_type_pack(cksum_type);
- -              repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc,
- -                                                      OST_WRITE, cksum_type);
+ +              repbody->oa.o_cksum = tgt_checksum_niobuf(tsi->tsi_tgt,
+ +                                                        local_nb, npages,
+ +                                                        OST_WRITE,
+ +                                                        cksum_type);
                 cksum_counter++;
   
                 if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) {
diff --combined lustre/target/tgt_main.c

index ef85c9c,3783674..da09bcb
--- 1/lustre/target/tgt_main.c
--- 2/lustre/target/tgt_main.c
+++ b/lustre/target/tgt_main.c
@@@ -152,6 -152,8 +152,8 @@@ int tgt_init(const struct lu_env *env, 
         struct lu_attr           attr;
         struct lu_fid            fid;
         struct dt_object        *o;
+       struct tg_grants_data   *tgd = &lut->lut_tgd;
+       struct obd_statfs       *osfs;
         int i, rc = 0;
   
         ENTRY;
@@@ -188,6 -190,38 +190,38 @@@
         if (!obd->obd_replayable)
                 RETURN(0);
   
+       /* initialize grant and statfs data in target */
+       dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf);
+ 
+       /* statfs data */
+       spin_lock_init(&tgd->tgd_osfs_lock);
+       tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
+       tgd->tgd_osfs_unstable = 0;
+       tgd->tgd_statfs_inflight = 0;
+       tgd->tgd_osfs_inflight = 0;
+ 
+       /* grant data */
+       spin_lock_init(&tgd->tgd_grant_lock);
+       tgd->tgd_tot_dirty = 0;
+       tgd->tgd_tot_granted = 0;
+       tgd->tgd_tot_pending = 0;
+       tgd->tgd_grant_compat_disable = 0;
+ 
+       /* populate cached statfs data */
+       osfs = &tgt_th_info(env)->tti_u.osfs;
+       rc = tgt_statfs_internal(env, lut, osfs, 0, NULL);
+       if (rc != 0) {
+               CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut),
+                       rc);
+               GOTO(out, rc);
+       }
+       if (!is_power_of_2(osfs->os_bsize)) {
+               CERROR("%s: blocksize (%d) is not a power of 2\n",
+                       tgt_name(lut), osfs->os_bsize);
+               GOTO(out, rc = -EPROTO);
+       }
+       tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
+ 
         spin_lock_init(&lut->lut_translock);
         spin_lock_init(&lut->lut_client_bitmap_lock);
   
@@@ -338,37 -372,8 +372,37 @@@ void tgt_fini(const struct lu_env *env
   }
   EXPORT_SYMBOL(tgt_fini);
   
+ +/* slab cache backing struct tgt_thread_info (see tgt_key_init/tgt_key_fini) */
+ +static struct kmem_cache *tgt_thread_kmem;
+ +/* slab cache backing struct tgt_session_info (see tgt_ses_key_init/_fini) */
+ +static struct kmem_cache *tgt_session_kmem;
+ +/*
+ + * Descriptors of the caches above; the table is handed to lu_kmem_init()
+ + * in tgt_mod_init() and to lu_kmem_fini() in tgt_mod_exit().
+ + */
+ +static struct lu_kmem_descr tgt_caches[] = {
+ +      {
+ +              .ckd_cache = &tgt_thread_kmem,
+ +              .ckd_name  = "tgt_thread_kmem",
+ +              .ckd_size  = sizeof(struct tgt_thread_info),
+ +      },
+ +      {
+ +              .ckd_cache = &tgt_session_kmem,
+ +              .ckd_name  = "tgt_session_kmem",
+ +              .ckd_size  = sizeof(struct tgt_session_info)
+ +      },
+ +      {
+ +              /* presumably the end-of-table marker -- confirm against
+ +               * lu_kmem_init() */
+ +              .ckd_cache = NULL
+ +      }
+ +};
+ +
+ +
   /* context key constructor/destructor: tgt_key_init, tgt_key_fini */
- -LU_KEY_INIT(tgt, struct tgt_thread_info);
+ +/**
+ + * Constructor of the target thread context key.
+ + *
+ + * Allocates one struct tgt_thread_info from the tgt_thread_kmem slab with
+ + * GFP_NOFS.
+ + *
+ + * \retval pointer to the new tgt_thread_info on success
+ + * \retval ERR_PTR(-ENOMEM) if the allocation failed
+ + */
+ +static void *tgt_key_init(const struct lu_context *ctx,
+ +                        struct lu_context_key *key)
+ +{
+ +      struct tgt_thread_info *info;
+ +
+ +      OBD_SLAB_ALLOC_PTR_GFP(info, tgt_thread_kmem, GFP_NOFS);
+ +
+ +      return info != NULL ? (void *)info : ERR_PTR(-ENOMEM);
+ +}
   
   static void tgt_key_fini(const struct lu_context *ctx,
                          struct lu_context_key *key, void *data)
@@@ -385,7 -390,7 +419,7 @@@
         if (args->ta_args != NULL)
                 OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) *
                                         args->ta_alloc_args);
- -      OBD_FREE_PTR(info);
+ +      OBD_SLAB_FREE_PTR(info, tgt_thread_kmem);
   }
   
   static void tgt_key_exit(const struct lu_context *ctx,
@@@ -407,25 -412,8 +441,25 @@@ struct lu_context_key tgt_thread_key = 
   
   LU_KEY_INIT_GENERIC(tgt);
   
- -/* context key constructor/destructor: tgt_ses_key_init, tgt_ses_key_fini */
- -LU_KEY_INIT_FINI(tgt_ses, struct tgt_session_info);
+ +/**
+ + * Constructor of the target session context key.
+ + *
+ + * Allocates one struct tgt_session_info from the tgt_session_kmem slab with
+ + * GFP_NOFS.
+ + *
+ + * \retval pointer to the new tgt_session_info on success
+ + * \retval ERR_PTR(-ENOMEM) if the allocation failed
+ + */
+ +static void *tgt_ses_key_init(const struct lu_context *ctx,
+ +                            struct lu_context_key *key)
+ +{
+ +      struct tgt_session_info *tsi;
+ +
+ +      OBD_SLAB_ALLOC_PTR_GFP(tsi, tgt_session_kmem, GFP_NOFS);
+ +
+ +      return tsi != NULL ? (void *)tsi : ERR_PTR(-ENOMEM);
+ +}
+ +
+ +/**
+ + * Destructor of the target session context key.
+ + *
+ + * Returns the struct tgt_session_info passed in \a data to the
+ + * tgt_session_kmem slab.
+ + */
+ +static void tgt_ses_key_fini(const struct lu_context *ctx,
+ +                           struct lu_context_key *key, void *data)
+ +{
+ +      struct tgt_session_info *tsi;
+ +
+ +      tsi = data;
+ +      OBD_SLAB_FREE_PTR(tsi, tgt_session_kmem);
+ +}
   
   /* context key: tgt_session_key */
   struct lu_context_key tgt_session_key = {
@@@ -448,13 -436,8 +482,13 @@@ struct page *tgt_page_to_corrupt
   
   int tgt_mod_init(void)
   {
+ +      int     result;
         ENTRY;
   
+ +      result = lu_kmem_init(tgt_caches);
+ +      if (result != 0)
+ +              RETURN(result);
+ +
         tgt_page_to_corrupt = alloc_page(GFP_KERNEL);
   
         tgt_key_init_generic(&tgt_thread_key, NULL);
@@@ -478,7 -461,5 +512,7 @@@ void tgt_mod_exit(void
         lu_context_key_degister(&tgt_thread_key);
         lu_context_key_degister(&tgt_session_key);
         update_info_fini();
+ +
+ +      lu_kmem_fini(tgt_caches);
   }
   
diff --combined lustre/tests/conf-sanity.sh

index 7088394,13c1857..829f33a
--- 1/lustre/tests/conf-sanity.sh
--- 2/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@@ -64,8 -64,8 +64,8 @@@ OSTDEV1_2=$fs2ost_DE
   OSTDEV2_2=$fs3ost_DEV
   
   if ! combined_mgs_mds; then
- -      # bug number for skipped test: LU-9860 LU-9860 LU-9860 LU-9860
- -      ALWAYS_EXCEPT="$ALWAYS_EXCEPT  33a     43b     53b     54b"
+ +      # bug number for skipped test: LU-9860 LU-9860 LU-9860
+ +      ALWAYS_EXCEPT="$ALWAYS_EXCEPT  43b     53b     54b"
         # bug number for skipped test: LU-9875 LU-9879 LU-9879 LU-9879 LU-9879
         ALWAYS_EXCEPT="$ALWAYS_EXCEPT  70e     80      84      87      100"
         # bug number for skipped test: LU-8110 LU-9400 LU-9879 LU-9879 LU-9879
@@@ -255,9 -255,7 +255,9 @@@ cleanup_nocli() 
   }
   
   cleanup() {
- -      umount_client $MOUNT || return 200
+ +      local force=""
+ +      [ "x$1" != "x" ] && force='-f'
+ +      umount_client $MOUNT $force|| return 200
         cleanup_nocli || return $?
   }
   
@@@ -1689,6 -1687,7 +1689,7 @@@ t32_test() 
         local tarball=$1
         local writeconf=$2
         local dne_upgrade=${dne_upgrade:-"no"}
+       local dom_upgrade=${dom_upgrade:-"no"}
         local ff_convert=${ff_convert:-"no"}
         local shall_cleanup_mdt=false
         local shall_cleanup_mdt1=false
@@@ -2025,11 -2024,6 +2026,6 @@@
                 shall_cleanup_lustre=true
                 $r $LCTL set_param debug="$PTLDEBUG"
   
-               t32_verify_quota $node $fsname $tmp/mnt/lustre || {
-                       error_noexit "verify quota failed"
-                       return 1
-               }
- 
                 if $r test -f $tmp/list; then
                         #
                         # There is not a Test Framework API to copy files to or
@@@ -2081,6 -2075,43 +2077,43 @@@
                         echo "list verification skipped"
                 fi
   
+               if [ "$dom_upgrade" != "no" ]; then
+                       echo "Check DoM file can be created"
+                       $LFS setstripe -E 1M -L mdt -E EOF $tmp/mnt/lustre/dom || {
+                               error_noexit "Verify DoM creation"
+                               return 1
+                       }
+                       [ $($LFS getstripe -L $tmp/mnt/lustre/dom) == 100 ] || {
+                               error_noexit "Verify a DoM file"
+                               return 1
+                       }
+                       dd if=/dev/urandom of=$tmp/mnt/lustre/dom bs=4096 \
+                               count=1 conv=fsync || {
+                               error_noexit "Cannot write to DoM file"
+                               return 1
+                       }
+                       [ $(stat -c%s $tmp/mnt/lustre/dom) == 4096 ] || {
+                               error_noexit "DoM: bad size after write"
+                               return 1
+                       }
+                       rm $tmp/mnt/lustre/dom
+ 
+                       $r $LCTL get_param -n lod.*MDT0000*.dom_stripesize || {
+                               error_noexit "Getting \"dom_stripesize\""
+                               return 1
+                       }
+                       $r $LCTL conf_param \
+                               $fsname-MDT0000.lod.dom_stripesize=0 || {
+                               error_noexit "Changing \"dom_stripesize\""
+                               return 1
+                       }
+                       wait_update $(facet_host mds) "$LCTL get_param \
+                               -n lod.*MDT0000*.dom_stripesize" 0 || {
+                               error_noexit "Verifying \"dom_stripesize\""
+                               return 1
+                       }
+               fi
+ 
                 if [ "$dne_upgrade" != "no" ]; then
                         $LFS mkdir -i 1 -c2 $tmp/mnt/lustre/striped_dir || {
                                 error_noexit "set striped dir failed"
@@@ -2385,6 -2416,21 +2418,21 @@@ test_32d() 
   }
   run_test 32d "convert ff test"
   
 + # test_32e: run the t32 upgrade test with the DoM checks enabled
 + # (dom_upgrade=yes) on every tarball whose name contains "2_9".
 + # Returns the sum of the exit codes of the failed t32_test runs.
 + test_32e() {
 +       local tarballs
 +       local tarball
 +       local rc=0
 + 
 +       # presumably fills in $tarballs -- confirm against t32_check
 +       t32_check
 +       for tarball in $tarballs; do
 +               # only "2_9" images are used; grep also echoes the name
 +               echo $tarball | grep "2_9" || continue
 +               #load_modules
 +               # keep going after a failure, accumulating the exit codes
 +               dom_upgrade=yes t32_test $tarball writeconf || let "rc += $?"
 +       done
 +       return $rc
 + }
 + run_test 32e "dom upgrade test"
+ 
   test_33a() { # bug 12333, was test_33
         local FSNAME2=test-123
         local MDSDEV=$(mdsdevname ${SINGLEMDS//mds/})
@@@ -2409,20 -2455,15 +2457,20 @@@
                 mkfsoptions="--mkfsoptions=\\\"-J size=8\\\"" # See bug 17931.
         fi
   
- -      add fs2mds $(mkfs_opts mds1 ${fs2mdsdev}) --mgs --fsname=${FSNAME2} \
- -              --reformat $mkfsoptions $fs2mdsdev $fs2mdsvdev || exit 10
+ +      if combined_mgs_mds; then
+ +              local mgs_flag="--mgs"
+ +      fi
+ +
+ +      add fs2mds $(mkfs_opts mds1 ${fs2mdsdev}) --fsname=${FSNAME2} \
+ +              --reformat $mgs_flag $mkfsoptions $fs2mdsdev $fs2mdsvdev ||
+ +              exit 10
         add fs2ost $(mkfs_opts ost1 ${fs2ostdev}) --mgsnode=$MGSNID \
                 --fsname=${FSNAME2} --index=8191 --reformat $fs2ostdev \
                 $fs2ostvdev || exit 10
   
         start fs2mds $fs2mdsdev $MDS_MOUNT_OPTS && trap cleanup_fs2 EXIT INT
         start fs2ost $fs2ostdev $OST_MOUNT_OPTS
- -      do_facet $SINGLEMDS "$LCTL conf_param $FSNAME2.sys.timeout=200" ||
+ +      do_facet mgs "$LCTL conf_param $FSNAME2.sys.timeout=200" ||
                 error "$LCTL conf_param $FSNAME2.sys.timeout=200 failed"
         mkdir -p $MOUNT2 || error "mkdir $MOUNT2 failed"
         $MOUNT_CMD $MGSNID:/${FSNAME2} $MOUNT2 || error "$MOUNT_CMD failed"
@@@ -2897,7 -2938,7 +2945,7 @@@ test_41b() 
         echo "blah blah" > $MOUNT/$tfile
         cat $MOUNT/$tfile || error "cat $MOUNT/$tfile failed"
   
- -      umount_client $MOUNT || error "umount_client $MOUNT failed"
+ +      umount_client $MOUNT -f || error "umount_client $MOUNT failed"
         stop_ost || error "Unable to stop OST1"
         stop_mds || error "Unable to stop MDS"
         stop_mds || error "Unable to stop MDS on second try"
@@@ -5029,7 -5070,6 +5077,7 @@@ test_70e() 
         soc=$(do_facet mds1 "$LCTL get_param -n \
                 mdt.*MDT0000.sync_lock_cancel")
         [ $soc == "never" ] || error "SoC enabled on single MDS"
+ +      umount_client $MOUNT -f > /dev/null
   
         cleanup || error "cleanup failed with $?"
   }
@@@ -7162,7 -7202,7 +7210,7 @@@ test_99(
         do_facet ost1 $DEBUGFS -c -R stats `ostdevname 1` | grep "meta_bg" ||
                 error "meta_bg is not set"
   
- -      return 0
+ +      reformat
   }
   run_test 99 "Adding meta_bg option"
   
@@@ -7439,7 -7479,7 +7487,7 @@@ error_and_umount() 
   }
   
   test_105() {
- -      cleanup
+ +      cleanup -f
         reformat
         setup
         mkdir -p $TMP/$tdir
diff --combined lustre/tests/sanity.sh

index 6c1b7ed,929fdc4..881a3d7
--- 1/lustre/tests/sanity.sh
--- 2/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@@ -12,8 -12,8 +12,8 @@@ ONLY=${ONLY:-"$*"
   ALWAYS_EXCEPT="                42a    42b      42c     45   68b $SANITY_EXCEPT"
   # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
   
- -# skipped tests: LU-2036 LU-8411 LU-9096 LU-9054
- -ALWAYS_EXCEPT="  76    407     253     312 $ALWAYS_EXCEPT"
+ +# skipped tests: LU-8411 LU-9096 LU-9054 LU-10199
+ +ALWAYS_EXCEPT="  407     253     312     56xb     $ALWAYS_EXCEPT"
   
   # Check Grants after these tests
   GRANT_CHECK_LIST="$GRANT_CHECK_LIST 42a 42b 42c 42d 42e 63a 63b 64a 64b 64c"
@@@ -40,6 -40,7 +40,7 @@@ SRCDIR=$(cd $(dirname $0); echo $PWD
   export PATH=$PATH:/sbin
   
   TMP=${TMP:-/tmp}
+ OSC=${OSC:-"osc"}
   
   CC=${CC:-cc}
   CHECKSTAT=${CHECKSTAT:-"checkstat -v"}
@@@ -1712,10 -1713,10 +1713,10 @@@ test_27w() { # bug 1099
         $LFS setstripe -S 65536 $DIR/$tdir/f0 || error "setstripe failed"
         [ $($LFS getstripe -S $DIR/$tdir/f0) -ne 65536 ] &&
                 error "stripe size $size != 65536" || true
- -      [ $($LFS getstripe -d $DIR/$tdir | grep -c "stripe_count") -ne 1 ] &&
- -              error "$LFS getstripe -d $DIR/$tdir failed" || true
+ +      [ $($LFS getstripe -d $DIR/$tdir | grep -c "stripe_count") -eq 0 ] &&
+ +              error "$LFS getstripe -d $DIR/$tdir no 'stripe_count'" || true
   }
- -run_test 27w "check $LFS setstripe -S option"
+ +run_test 27w "check $LFS setstripe -S and getstrip -d options"
   
   test_27wa() {
         [[ $OSTCOUNT -lt 2 ]] &&
@@@ -3587,7 -3588,7 +3588,7 @@@ test_41() 
   run_test 41 "test small file write + fstat ====================="
   
   count_ost_writes() {
-       lctl get_param -n osc.*.stats |
+       lctl get_param -n ${OSC}.*.stats |
                 awk -vwrites=0 '/ost_write/ { writes += $2 } \
                         END { printf("%0.0f", writes) }'
   }
@@@ -3647,7 -3648,7 +3648,7 @@@ setup_test42() 
   test_42a() {
         [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
         setup_test42
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
         stop_writeback
         sync; sleep 1; sync # just to be safe
         BEFOREWRITES=`count_ost_writes`
@@@ -3663,7 -3664,7 +3664,7 @@@ run_test 42a "ensure that we don't flus
   test_42b() {
         [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
         setup_test42
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
         stop_writeback
         sync
         dd if=/dev/zero of=$DIR/f42b bs=1024 count=100
@@@ -3699,21 -3700,21 +3700,21 @@@ run_test 42b "test destroy of file wit
   # start the file with a full-file pw lock to match against
   # until the truncate.
   trunc_test() {
-         test=$1
-         file=$DIR/$test
-         offset=$2
-       cancel_lru_locks osc
+       test=$1
+       file=$DIR/$test
+       offset=$2
+       cancel_lru_locks $OSC
         stop_writeback
         # prime the file with 0,EOF PW to match
         touch $file
           $TRUNCATE $file 0
           sync; sync
         # now the real test..
-         dd if=/dev/zero of=$file bs=1024 count=100
-         BEFOREWRITES=`count_ost_writes`
-         $TRUNCATE $file $offset
-         cancel_lru_locks osc
-         AFTERWRITES=`count_ost_writes`
+       dd if=/dev/zero of=$file bs=1024 count=100
+       BEFOREWRITES=`count_ost_writes`
+       $TRUNCATE $file $offset
+       cancel_lru_locks $OSC
+       AFTERWRITES=`count_ost_writes`
         start_writeback
   }
   
@@@ -3912,7 -3913,7 +3913,7 @@@ run_test 44a "test sparse pwrite ======
   
   dirty_osc_total() {
         tot=0
-       for d in `lctl get_param -n osc.*.cur_dirty_bytes`; do
+       for d in `lctl get_param -n ${OSC}.*.cur_dirty_bytes`; do
                 tot=$(($tot + $d))
         done
         echo $tot
@@@ -4153,13 -4154,11 +4154,13 @@@ test_51b() 
         [[ $numfree -lt $nrdirs ]] && skip "not enough blocks ($numfree)" &&
                 return
   
- -      trap cleanup_print_lfsdf EXIT
+ +      trap cleanup_print_lfs_df EXIT
   
         # create files
- -      createmany -d $dir/d $nrdirs ||
+ +      createmany -d $dir/d $nrdirs || {
+ +              unlinkmany $dir/d $nrdirs
                 error "failed to create $nrdirs subdirs in MDT$mdtidx:$dir"
+ +      }
   
         # really created :
         nrdirs=$(ls -U $dir | wc -l)
@@@ -4268,10 -4267,8 +4269,10 @@@ test_51f() 
                 echo "left ulimit at $ulimit_old"
         fi
   
- -      createmany -o -k -t 120 $DIR/$tdir/f $numfree ||
+ +      createmany -o -k -t 120 $DIR/$tdir/f $numfree || {
+ +              unlinkmany $DIR/$tdir/f $numfree
                 error "create+open $numfree files in $DIR/$tdir failed"
+ +      }
         ulimit -n $ulimit_old
   
         # if createmany exits at 120s there will be fewer than $numfree files
@@@ -5154,15 -5151,16 +5155,15 @@@ test_56x() 
         check_swap_layouts_support && return 0
         [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
   
- -      local dir0=$DIR/$tdir/$testnum
- -      test_mkdir -p $dir0
- -
+ +      local dir0=$DIR/$tdir
         local ref1=/etc/passwd
         local file1=$dir0/file1
   
- -      $SETSTRIPE -c 2 $file1
+ +      test_mkdir $dir0 || error "creating dir $dir0"
+ +      $LFS setstripe -c 2 $file1
         cp $ref1 $file1
         $LFS migrate -c 1 $file1 || error "migrate failed rc = $?"
- -      stripe=$($GETSTRIPE -c $file1)
+ +      stripe=$($LFS getstripe -c $file1)
         [[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1"
         cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1"
   
@@@ -5181,10 -5179,10 +5182,10 @@@ test_56xa() 
         local ref1=/etc/passwd
         local file1=$dir0/file1
   
- -      $SETSTRIPE -c 2 $file1
+ +      $LFS setstripe -c 2 $file1
         cp $ref1 $file1
         $LFS migrate --block -c 1 $file1 || error "migrate failed rc = $?"
- -      local stripe=$($GETSTRIPE -c $file1)
+ +      local stripe=$($LFS getstripe -c $file1)
         [[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1"
         cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1"
   
@@@ -5193,110 -5191,6 +5194,110 @@@
   }
   run_test 56xa "lfs migration --block support"
   
+ +check_migrate_links() {
+ +      local dir="$1"
+ +      local file1="$dir/file1"
+ +      local begin="$2"
+ +      local count="$3"
+ +      local total_count=$(($begin + $count - 1))
+ +      local symlink_count=10
+ +      local uniq_count=10
+ +
+ +      if [ ! -f "$file1" ]; then
+ +              echo -n "creating initial file..."
+ +              $LFS setstripe -c 1 -S "512k" "$file1" ||
+ +                      error "cannot setstripe initial file"
+ +              echo "done"
+ +
+ +              echo -n "creating symlinks..."
+ +              for s in $(seq 1 $symlink_count); do
+ +                      ln -s "$file1" "$dir/slink$s" ||
+ +                              error "cannot create symlinks"
+ +              done
+ +              echo "done"
+ +
+ +              echo -n "creating nonlinked files..."
+ +              createmany -o "$dir/uniq" 1 10 &> /dev/null ||
+ +                      error "cannot create nonlinked files"
+ +              echo "done"
+ +      fi
+ +
+ +      # create hard links
+ +      if [ ! -f "$dir/file$total_count" ]; then
+ +              echo -n "creating hard links $begin:$total_count..."
+ +              createmany -l"$file1" "$dir/file" "$begin" "$count" &>  \
+ +                      /dev/null || error "cannot create hard links"
+ +              echo "done"
+ +      fi
+ +
+ +      echo -n "checking number of hard links listed in xattrs..."
+ +      local fid=$($LFS getstripe -F "$file1")
+ +      local paths=($($LFS fid2path "$MOUNT" "$fid" 2> /dev/null))
+ +
+ +      echo "${#paths[*]}"
+ +      if [ ${#paths[*]} -lt $total_count -a "$begin" -eq 2  ]; then
+ +                      echo "hard link list has unexpected size, skipping test"
+ +                      return 0
+ +      fi
+ +      if [ ${#paths[*]} -ge $total_count -a "$begin" -ne 2  ]; then
+ +                      error "link names should exceed xattrs size"
+ +      fi
+ +
+ +      echo -n "migrating files..."
+ +      local migrate_out=$($LFS_MIGRATE -y -S '1m' $dir)
+ +      local rc=$?
+ +      [ $rc -eq 0 ] || error "migrate failed rc = $rc"
+ +      echo "done"
+ +
+ +      # make sure all links have been properly migrated
+ +      echo -n "verifying files..."
+ +      fid=$($LFS getstripe -F "$file1") ||
+ +              error "cannot get fid for file $file1"
+ +      for i in $(seq 2 $total_count); do
+ +              local fid2=$($LFS getstripe -F $dir/file$i)
+ +              [ "$fid2" == "$fid" ] ||
+ +                      error "migrated hard link has mismatched FID"
+ +      done
+ +
+ +      # make sure hard links were properly detected, and migration was
+ +      # performed only once for the entire link set; nonlinked files should
+ +      # also be migrated
+ +      local actual=$(grep -c 'done migrate' <<< "$migrate_out")
+ +      local expected=$(($uniq_count + 1))
+ +      [ "$actual" -eq  "$expected" ] ||
+ +              error "hard links individually migrated ($actual != $expected)"
+ +
+ +      # make sure the correct number of hard links are present
+ +      local hardlinks=$(stat -c '%h' "$file1")
+ +      [ $hardlinks -eq $total_count ] ||
+ +              error "num hard links $hardlinks != $total_count"
+ +      echo "done"
+ +
+ +      return 0
+ +}
+ +
+ +test_56xb() {
+ +      local dir0="$DIR/$tdir"
+ +
+ +      test_mkdir "$dir0" || error "cannot create dir $dir0"
+ +
+ +      echo "testing lfs migrate mode when all links fit within xattrs"
+ +      LFS_MIGRATE_RSYNC=false check_migrate_links "$dir0" 2 99
+ +
+ +      echo "testing rsync mode when all links fit within xattrs"
+ +      LFS_MIGRATE_RSYNC=true check_migrate_links "$dir0" 2 99
+ +
+ +      echo "testing lfs migrate mode when all links do not fit within xattrs"
+ +      LFS_MIGRATE_RSYNC=false check_migrate_links "$dir0" 101 100
+ +
+ +      echo "testing rsync mode when all links do not fit within xattrs"
+ +      LFS_MIGRATE_RSYNC=true check_migrate_links "$dir0" 101 100
+ +
+ +      # clean up
+ +      rm -rf $dir0
+ +}
+ +run_test 56xb "lfs migration hard link support"
+ +
   test_56y() {
         [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.4.53) ] &&
                 skip "No HSM $(lustre_build_version $SINGLEMDS) MDS < 2.4.53" &&
@@@ -6114,9 -6008,21 +6115,9 @@@ num_inodes() 
         awk '/lustre_inode_cache/ {print $2; exit}' /proc/slabinfo
   }
   
- -get_inode_slab_tunables() {
- -      awk '/lustre_inode_cache/ {print $9," ",$10," ",$11; exit}' /proc/slabinfo
- -}
- -
- -set_inode_slab_tunables() {
- -      echo "lustre_inode_cache $1" > /proc/slabinfo
- -}
- -
   test_76() { # Now for bug 20433, added originally in bug 1443
         [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- -      local SLAB_SETTINGS=$(get_inode_slab_tunables)
         local CPUS=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
- -      # we cannot set limit below 1 which means 1 inode in each
- -      # per-cpu cache is still allowed
- -      set_inode_slab_tunables "1 1 0"
         cancel_lru_locks osc
         BEFORE_INODES=$(num_inodes)
         echo "before inodes: $BEFORE_INODES"
@@@ -6139,6 -6045,7 +6140,6 @@@
                         error "inode slab grew from $BEFORE_INODES to $AFTER_INODES"
                 fi
         done
- -      set_inode_slab_tunables "$SLAB_SETTINGS"
   }
   run_test 76 "confirm clients recycle inodes properly ===="
   
@@@ -6878,7 -6785,7 +6879,7 @@@ test_101e() 
         done
   
         echo "Cancel LRU locks on lustre client to flush the client cache"
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
   
         echo "Reset readahead stats"
         $LCTL set_param -n llite.*.read_ahead_stats 0
@@@ -7026,7 -6933,7 +7027,7 @@@ setup_test102() 
         done
   
         cd $DIR
- -      $1 $TAR cf $TMP/f102.tar $tdir --xattrs
+ +      $1 tar cf $TMP/f102.tar $tdir --xattrs
   }
   
   cleanup_test102() {
@@@ -7185,17 -7092,20 +7186,17 @@@ compare_stripe_info1() 
         return 0
   }
   
- -find_lustre_tar() {
- -      [ -n "$(which tar 2>/dev/null)" ] &&
- -              strings $(which tar) | grep -q "lustre" && echo tar
+ +have_xattrs_include() {
+ +      tar --help | grep -q xattrs-include &&
+ +              echo --xattrs-include="lustre.*"
   }
   
   test_102d() {
         [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- -      # b10930: tar test for trusted.lov xattr
- -      TAR=$(find_lustre_tar)
- -      [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
         [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
+ +      XINC=$(have_xattrs_include)
         setup_test102
- -      test_mkdir $DIR/$tdir
- -      $TAR xf $TMP/$tfile.tar -C $DIR/$tdir --xattrs
+ +      tar xf $TMP/f102.tar -C $DIR/$tdir --xattrs $XINC
         cd $DIR/$tdir/$tdir
         compare_stripe_info1
   }
@@@ -7203,13 -7113,14 +7204,13 @@@ run_test 102d "tar restore stripe info 
   
   test_102f() {
         [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- -      # b10930: tar test for trusted.lov xattr
- -      TAR=$(find_lustre_tar)
- -      [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
         [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
+ +      XINC=$(have_xattrs_include)
         setup_test102
         test_mkdir $DIR/$tdir.restore
         cd $DIR
- -      $TAR cf - --xattrs $tdir | $TAR xf - --xattrs -C $DIR/$tdir.restore
+ +      tar cf - --xattrs $tdir | tar xf - \
+ +              -C $DIR/$tdir.restore --xattrs $XINC
         cd $DIR/$tdir.restore/$tdir
         compare_stripe_info1
   }
@@@ -7283,11 -7194,13 +7284,11 @@@ run_test 102i "lgetxattr test on symbol
   
   test_102j() {
         [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- -      TAR=$(find_lustre_tar)
- -      [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
         [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
+ +      XINC=$(have_xattrs_include)
         setup_test102 "$RUNAS"
- -      test_mkdir $DIR/$tdir
         chown $RUNAS_ID $DIR/$tdir
- -      $RUNAS $TAR xf $TMP/f102.tar -C $DIR/$tdir --xattrs
+ +      $RUNAS tar xf $TMP/f102.tar -C $DIR/$tdir --xattrs $XINC
         cd $DIR/$tdir/$tdir
         compare_stripe_info1 "$RUNAS"
   }
@@@ -10166,31 -10079,31 +10167,31 @@@ test_150() 
         [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
         local TF="$TMP/$tfile"
   
-         dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
-         cp $TF $DIR/$tfile
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TMP/$tfile $DIR/$tfile differ"
-         remount_client $MOUNT
-         df -P $MOUNT
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (remount)"
+       dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
+       cp $TF $DIR/$tfile
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TMP/$tfile $DIR/$tfile differ"
+       remount_client $MOUNT
+       df -P $MOUNT
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (remount)"
   
-         $TRUNCATE $TF 6000
-         $TRUNCATE $DIR/$tfile 6000
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (truncate1)"
+       $TRUNCATE $TF 6000
+       $TRUNCATE $DIR/$tfile 6000
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (truncate1)"
   
-         echo "12345" >>$TF
-         echo "12345" >>$DIR/$tfile
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append1)"
+       echo "12345" >>$TF
+       echo "12345" >>$DIR/$tfile
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append1)"
   
-         echo "12345" >>$TF
-         echo "12345" >>$DIR/$tfile
-         cancel_lru_locks osc
-         cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append2)"
+       echo "12345" >>$TF
+       echo "12345" >>$DIR/$tfile
+       cancel_lru_locks $OSC
+       cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append2)"
   
-         rm -f $TF
-         true
+       rm -f $TF
+       true
   }
   run_test 150 "truncate/append tests"
   
@@@ -10738,7 -10651,7 +10739,7 @@@ test_155_small_load() 
       dd if=/dev/urandom of=$temp bs=6096 count=1 || \
           error "dd of=$temp bs=6096 count=1 failed"
       cp $temp $file
-     cancel_lru_locks osc
+     cancel_lru_locks $OSC
       cmp $temp $file || error "$temp $file differ"
   
       $TRUNCATE $temp 6000
@@@ -12704,12 -12617,16 +12705,12 @@@ run_test 214 "hash-indexed directory te
   
   # having "abc" as 1st arg, creates $TMP/lnet_abc.out and $TMP/lnet_abc.sys
   create_lnet_proc_files() {
- -      lctl get_param -n $1 >$TMP/lnet_$1.out || error "cannot read lnet.$1"
- -      sysctl lnet.$1 >$TMP/lnet_$1.sys_tmp || error "cannot read lnet.$1"
- -
- -      sed "s/^lnet.$1\ =\ //g" "$TMP/lnet_$1.sys_tmp" >$TMP/lnet_$1.sys
- -      rm -f "$TMP/lnet_$1.sys_tmp"
+ +      lctl get_param -n $1 >$TMP/lnet_$1.sys || error "cannot read lnet.$1"
   }
   
   # counterpart of create_lnet_proc_files
   remove_lnet_proc_files() {
- -      rm -f $TMP/lnet_$1.out $TMP/lnet_$1.sys
+ +      rm -f $TMP/lnet_$1.sys
   }
   
   # uses 1st arg as trailing part of filename, 2nd arg as description for reports,
@@@ -12819,6 -12736,7 +12820,6 @@@ test_215() { # for bugs 18102, 21079, 2
   
         # can we successfully write to lnet.stats?
         lctl set_param -n stats=0 || error "cannot write to lnet.stats"
- -      sysctl -w lnet.stats=0 || error "cannot write to lnet.stats"
   }
   run_test 215 "lnet exists and has proper content - bugs 18102, 21079, 21517"
   
@@@ -14054,10 -13972,9 +14055,10 @@@ test_239() 
         mkdir -p $DIR/$tdir
         createmany -o $DIR/$tdir/f- 5000
         unlinkmany $DIR/$tdir/f- 5000
- -      do_nodes $list "lctl set_param -n osp*.*.sync_changes 1"
- -      changes=$(do_nodes $list "lctl get_param -n osc.*MDT*.sync_changes \
- -                      osc.*MDT*.sync_in_flight" | calc_sum)
+ +      [ $(lustre_version_code $SINGLEMDS) -gt $(version_code 2.10.53) ] &&
+ +              do_nodes $list "lctl set_param -n osp.*.force_sync=1"
+ +      changes=$(do_nodes $list "lctl get_param -n osp.*MDT*.sync_changes \
+ +                      osp.*MDT*.sync_in_flight" | calc_sum)
         [ "$changes" -eq 0 ] || error "$changes not synced"
   }
   run_test 239 "osp_sync test"
@@@ -14110,7 -14027,7 +14111,7 @@@ run_test 240 "race between ldlm enqueu
   test_241_bio() {
         for LOOP in $(seq $1); do
                 dd if=$DIR/$tfile of=/dev/null bs=40960 count=1 2>/dev/null
-               cancel_lru_locks osc || true
+               cancel_lru_locks $OSC || true
         done
   }
   
@@@ -14124,7 -14041,7 +14125,7 @@@ test_241_dio() 
   test_241a() { # was test_241
         dd if=/dev/zero of=$DIR/$tfile count=1 bs=40960
         ls -la $DIR/$tfile
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
         test_241_bio 1000 &
         PID=$!
         test_241_dio 1000
@@@ -14986,12 -14903,11 +14987,12 @@@ test_256() 
   
         #after mount new plainllog is used
         touch $DIR/$tdir/{11..19}
- -      local TEMP256FILE=$(mktemp TEMP256XXXXXX)
+ +      do_facet mds1 sync
+ +      local TEMP256FILE=$(mktemp -u TEMP256XXXXXX)
         cat_sl=$(do_facet mds1 \
         "$DEBUGFS -R \\\"dump changelog_catalog $TEMP256FILE\\\" $mdt_dev; \
          llog_reader $TEMP256FILE | grep \\\"type=1064553b\\\" | wc -l")
- -      rm $TEMP256FILE
+ +      do_facet mds1 rm $TEMP256FILE
   
         if (( cat_sl != 2 )); then
                 do_facet mds1 $LCTL --device $MDT0 changelog_deregister $cl_user
@@@ -15000,12 -14916,11 +15001,12 @@@
   
         $LFS changelog_clear $MDT0 $cl_user 0
   
- -      TEMP256FILE=$(mktemp TEMP256XXXXXX)
+ +      do_facet mds1 sync
+ +      TEMP256FILE=$(mktemp -u TEMP256XXXXXX)
         cat_sl=$(do_facet mds1 \
         "$DEBUGFS -R \\\"dump changelog_catalog $TEMP256FILE\\\" $mdt_dev; \
          llog_reader $TEMP256FILE | grep \\\"type=1064553b\\\" | wc -l")
- -      rm $TEMP256FILE
+ +      do_facet mds1 rm $TEMP256FILE
   
         do_facet mds1 $LCTL --device $MDT0 changelog_deregister $cl_user
   
@@@ -15082,6 -14997,370 +15083,370 @@@ test_260() 
   }
   run_test 260 "Check mdc_close fail"
   
+ ### Data-on-MDT sanity tests ###
+ test_270a() {
+       # create DoM file
+       local dom=$DIR/$tdir/dom_file
+       local tmp=$DIR/$tdir/tmp_file
+ 
+       mkdir -p $DIR/$tdir
+ 
+       # basic checks for DoM component creation
+       $LFS setstripe -E 1024K -E 1024K -L mdt $dom 2>/dev/null &&
+               error "Can set MDT layout to non-first entry"
+ 
+       $LFS setstripe -E 1024K -L mdt -E 1024K -L mdt $dom 2>/dev/null &&
+               error "Can define multiple entries as MDT layout"
+ 
+       $LFS setstripe -E 1M -L mdt $dom ||
+               error "Can't create DoM layout"
+ 
+       [ $($LFS getstripe -L $dom) == 100 ] || error "bad pattern"
+       [ $($LFS getstripe -c $dom) == 0 ] || error "bad stripe count"
+       [ $($LFS getstripe -S $dom) == 1048576 ] || error "bad stripe size"
+ 
+       local mdtidx=$($GETSTRIPE -M $dom)
+       local mdtname=MDT$(printf %04x $mdtidx)
+       local facet=mds$((mdtidx + 1))
+       local space_check=1
+ 
+       # Skip free space checks with ZFS
+       if [ "$(facet_fstype $facet)" == "zfs" ]; then
+               space_check=0
+       fi
+ 
+       # write
+       sync
+       local mdtfree1=$(do_facet $facet \
+               lctl get_param -n osd*.*$mdtname.kbytesfree)
+       dd if=/dev/urandom of=$tmp bs=1024 count=100
+       # check also direct IO along write
+       dd if=$tmp of=$dom bs=102400 count=1 oflag=direct
+       sync
+       cmp $tmp $dom || error "file data is different"
+       [ $(stat -c%s $dom) == 102400 ] || error "bad size after write"
+       if [ $space_check == 1 ]; then
+               local mdtfree2=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 102 ] ||
+                       error "MDT free space is wrong after write"
+       fi
+ 
+       # truncate
+       $TRUNCATE $dom 10000
+       [ $(stat -c%s $dom) == 10000 ] || error "bad size after truncate"
+       if [ $space_check == 1 ]; then
+               mdtfree1=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 92 ] ||
+                       error "MDT free space is wrong after truncate"
+       fi
+ 
+       # append
+       cat $tmp >> $dom
+       sync
+       [ $(stat -c%s $dom) == 112400 ] || error "bad size after append"
+       if [ $space_check == 1 ]; then
+               mdtfree2=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 102 ] ||
+                       error "MDT free space is wrong after append"
+       fi
+ 
+       # delete
+       rm $dom
+       if [ $space_check == 1 ]; then
+               mdtfree1=$(do_facet $facet \
+                               lctl get_param -n osd*.*$mdtname.kbytesfree)
+               [ $(($mdtfree1 - $mdtfree2)) -ge 112 ] ||
+                       error "MDT free space is wrong after removal"
+       fi
+ 
+       # combined striping
+       $LFS setstripe -E 1024K -L mdt -E EOF $dom ||
+               error "Can't create DoM + OST striping"
+ 
+       dd if=/dev/urandom of=$tmp bs=1024 count=2000
+       # check also direct IO along write
+       dd if=$tmp of=$dom bs=102400 count=20 oflag=direct
+       sync
+       cmp $tmp $dom || error "file data is different"
+       [ $(stat -c%s $dom) == 2048000 ] || error "bad size after write"
+       rm $dom
+       rm $tmp
+ 
+       return 0
+ }
+ run_test 270a "DoM: basic functionality tests"
+ 
+ test_270b() {
+       local dom=$DIR/$tdir/dom_file
+       local max_size=1048576
+ 
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E $max_size -L mdt $dom
+ 
+       # truncate over the limit
+       $TRUNCATE $dom $(($max_size + 1)) &&
+               error "successful truncate over the maximum size"
+       # write over the limit
+       dd if=/dev/zero of=$dom bs=$max_size seek=1 count=1 &&
+               error "successful write over the maximum size"
+       # append over the limit
+       dd if=/dev/zero of=$dom bs=$(($max_size - 3)) count=1
+       echo "12345" >> $dom && error "successful append over the maximum size"
+       rm $dom
+ 
+       return 0
+ }
+ run_test 270b "DoM: maximum size overflow checks for DoM-only file"
+ 
+ test_270c() {
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+ 
+       # check files inherit DoM EA
+       touch $DIR/$tdir/first
+       [ $($GETSTRIPE -L $DIR/$tdir/first) == 100 ] ||
+               error "bad pattern"
+       [ $($LFS getstripe -c $DIR/$tdir/first) == 0 ] ||
+               error "bad stripe count"
+       [ $($LFS getstripe -S $DIR/$tdir/first) == 1048576 ] ||
+               error "bad stripe size"
+ 
+       # check directory inherits DoM EA and uses it as default
+       mkdir $DIR/$tdir/subdir
+       touch $DIR/$tdir/subdir/second
+       [ $($LFS getstripe -L $DIR/$tdir/subdir/second) == 100 ] ||
+               error "bad pattern in sub-directory"
+       [ $($LFS getstripe -c $DIR/$tdir/subdir/second) == 0 ] ||
+               error "bad stripe count in sub-directory"
+       [ $($LFS getstripe -S $DIR/$tdir/subdir/second) == 1048576 ] ||
+               error "bad stripe size in sub-directory"
+       return 0
+ }
+ run_test 270c "DoM: DoM EA inheritance tests"
+ 
+ test_270d() {
+       mkdir -p $DIR/$tdir
+       $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+ 
+       # inherit default DoM striping
+       mkdir $DIR/$tdir/subdir
+       touch $DIR/$tdir/subdir/f1
+ 
+       # change default directory striping
+       $LFS setstripe -c 1 $DIR/$tdir/subdir
+       touch $DIR/$tdir/subdir/f2
+       [ $($LFS getstripe -c $DIR/$tdir/subdir/f2) == 1 ] ||
+               error "wrong default striping in file 2"
+       [ $($LFS getstripe -L $DIR/$tdir/subdir/f2) == 1 ] ||
+               error "bad pattern in file 2"
+       return 0
+ }
+ run_test 270d "DoM: change striping from DoM to RAID0"
+ 
+ test_270e() {
+       mkdir -p $DIR/$tdir/dom
+       mkdir -p $DIR/$tdir/norm
+       DOMFILES=20
+       NORMFILES=10
+       $LFS setstripe -E 1M -L mdt $DIR/$tdir/dom
+       $LFS setstripe -i 0 -S 2M $DIR/$tdir/norm
+ 
+       createmany -o $DIR/$tdir/dom/dom- $DOMFILES
+       createmany -o $DIR/$tdir/norm/norm- $NORMFILES
+ 
+       # find DoM files by layout
+       NUM=$($LFIND -L mdt -type f $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  $DOMFILES ] ||
+               error "lfs find -L: found $NUM, expected $DOMFILES"
+       echo "Test 1: lfs find 20 DOM files by layout: OK"
+ 
+       # there should be 1 dir with default DOM striping
+       NUM=$($LFIND -L mdt -type d $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  1 ] ||
+               error "lfs find -L: found $NUM, expected 1 dir"
+       echo "Test 2: lfs find 1 DOM dir by layout: OK"
+ 
+       # find DoM files by stripe size
+       NUM=$($LFIND -S -1200K -type f $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  $DOMFILES ] ||
+               error "lfs find -S: found $NUM, expected $DOMFILES"
+       echo "Test 4: lfs find 20 DOM files by stripe size: OK"
+ 
+       # find files by stripe offset except DoM files
+       NUM=$($LFIND -i 0 -type f $DIR/$tdir 2>/dev/null | wc -l)
+       [ $NUM -eq  $NORMFILES ] ||
+               error "lfs find -i: found $NUM, expected $NORMFILES"
+       echo "Test 5: lfs find no DOM files by stripe index: OK"
+       return 0
+ }
+ run_test 270e "DoM: lfs find with DoM files test"
+ 
+ test_270f() {
+       local mdtname=${FSNAME}-MDT0000-mdtlov
+       local dom=$DIR/$tdir/dom_file
+       local dom_limit_saved=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       local dom_limit=131072
+ 
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=$dom_limit
+       local dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       [ ${dom_limit} -eq ${dom_current} ] ||
+               error "Cannot change per-MDT DoM stripe limit to $dom_limit"
+ 
+       $LFS mkdir -i 0 -c 1 $DIR/$tdir
+       $LFS setstripe -d $DIR/$tdir
+       $LFS setstripe -E $dom_limit -L mdt $DIR/$tdir ||
+               error "Can't set directory default striping"
+ 
+       # exceed maximum stripe size
+       $LFS setstripe -E $(($dom_limit * 2)) -L mdt $dom &&
+               error "Able to create DoM component size more than LOD limit"
+ 
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=0
+       dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       [ 0 -eq ${dom_current} ] ||
+               error "Can't set zero DoM stripe limit"
+ 
+       # too low values to be aligned with smallest stripe size 64K
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=30000
+       dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       [ 30000 -eq ${dom_current} ] &&
+               error "Can set too small DoM stripe limit"
+ 
+       do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=2147483648
+       dom_current=$(do_facet mds1 $LCTL get_param -n \
+                                               lod.$mdtname.dom_stripesize)
+       echo $dom_current
+       [ 2147483648 -eq ${dom_current} ] &&
+               error "Can set too large DoM stripe limit"
+ 
+       do_facet mds1 $LCTL set_param -n \
+                               lod.$mdtname.dom_stripesize=$((dom_limit * 2))
+       $LFS setstripe -E $((dom_limit * 2)) -L mdt $dom ||
+               error "Can't create DoM component size after limit change"
+       do_facet mds1 $LCTL set_param -n \
+                               lod.$mdtname.dom_stripesize=$((dom_limit / 2))
+       $LFS setstripe -E $dom_limit -L mdt ${dom}_big &&
+               error "Can create big DoM component after limit decrease"
+       touch ${dom}_def ||
+               error "Can't create file with old default layout"
+ 
+       do_facet mds1 $LCTL set_param -n lod.*.dom_stripesize=$dom_limit_saved
+       return 0
+ }
+ run_test 270f "DoM: maximum DoM stripe size checks"
+ 
+ test_271a() {
+       local dom=$DIR/$tdir/dom
+ 
+       mkdir -p $DIR/$tdir
+ 
+       $LFS setstripe -E 1024K -L mdt $dom
+ 
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$dom bs=4096 count=1 || return 1
+       cat $dom > /dev/null
+       local reads=$(lctl get_param -n mdc.*.stats |
+                       awk '/ost_read/ {print $2}')
+       [ -z $reads ] || error "Unexpected $reads READ RPCs"
+       ls $dom
+       rm -f $dom
+ }
+ run_test 271a "DoM: data is cached for read after write"
+ 
+ test_271b() {
+       local dom=$DIR/$tdir/dom
+ 
+       mkdir -p $DIR/$tdir
+ 
+       $LFS setstripe -E 1024K -L mdt -E EOF $dom
+ 
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$dom bs=4096 count=1 || return 1
+       cancel_lru_locks mdc
+       $CHECKSTAT -t file -s 4096 $dom || error "stat #1 fails"
+       # second stat to check size is cached on client
+       $CHECKSTAT -t file -s 4096 $dom || error "stat #2 fails"
+       local gls=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       [ -z $gls ] || error "Unexpected $gls glimpse RPCs"
+       rm -f $dom
+ }
+ run_test 271b "DoM: no glimpse RPC for stat (DoM only file)"
+ 
+ test_271ba() {
+       local dom=$DIR/$tdir/dom
+ 
+       mkdir -p $DIR/$tdir
+ 
+       $LFS setstripe -E 1024K -L mdt -E EOF $dom
+ 
+       lctl set_param -n mdc.*.stats=clear
+       lctl set_param -n osc.*.stats=clear
+       dd if=/dev/zero of=$dom bs=2048K count=1 || return 1
+       cancel_lru_locks mdc
+       $CHECKSTAT -t file -s 2097152 $dom || error "stat"
+       # second stat to check size is cached on client
+       $CHECKSTAT -t file -s 2097152 $dom || error "stat"
+       local gls=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       [ -z $gls ] || error "Unexpected $gls glimpse RPCs"
+       local gls=$(lctl get_param -n osc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       [ -z $gls ] || error "Unexpected $gls OSC glimpse RPCs"
+       rm -f $dom
+ }
+ run_test 271ba "DoM: no glimpse RPC for stat (combined file)"
+ 
+ test_271c() {
+       # test to be enabled with lock_convert
+       skip "skipped until lock convert will be implemented" && return
+ 
+       local dom=$DIR/$tdir/dom
+ 
+       mkdir -p $DIR/$tdir
+ 
+       $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+ 
+       local mdtidx=$($LFS getstripe -M $DIR/$tdir)
+       local facet=mds$((mdtidx + 1))
+ 
+       cancel_lru_locks mdc
+       do_facet $facet lctl set_param -n mdt.*.dom_lock=0
+       createmany -o $dom 1000
+       lctl set_param -n mdc.*.stats=clear
+       smalliomany -w $dom 1000 200
+       lctl get_param -n mdc.*.stats
+       local enq=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_ibits_enqueue/ {print $2}')
+       # Each file has 1 open, 1 IO enqueues, total 2000
+       # but now we have also +1 getxattr for security.capability, total 3000
+       [ $enq -ge 2000 ] || error "Too few enqueues $enq, expected > 2000"
+       unlinkmany $dom 1000
+ 
+       cancel_lru_locks mdc
+       do_facet $facet lctl set_param -n mdt.*.dom_lock=1
+       createmany -o $dom 1000
+       lctl set_param -n mdc.*.stats=clear
+       smalliomany -w $dom 1000 200
+       lctl get_param -n mdc.*.stats
+       local enq_2=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_ibits_enqueue/ {print $2}')
+       # Expect to see reduced amount of RPCs by 1000 due to single enqueue
+       # for OPEN and IO lock.
+       [ $((enq - enq_2)) -ge 1000 ] ||
+               error "Too many enqueues $enq_2, expected about $((enq - 1000))"
+       unlinkmany $dom 1000
+       return 0
+ }
+ run_test 271c "DoM: IO lock at open saves enqueue RPCs"
+ 
   cleanup_test_300() {
         trap 0
         umask $SAVE_UMASK
@@@ -15406,17 -15685,17 +15771,17 @@@ test_300g() 
         $LFS setdirstripe -D -i1 $DIR/$tdir/striped_dir ||
                 error "create striped_dir failed"
   
+ +      $LFS setdirstripe -i0 $DIR/$tdir/striped_dir/dir0 ||
+ +              error "create dir0 fails"
+ +      stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir0)
+ +      [ $stripe_index -eq 0 ] ||
+ +              error "dir0 expect index 0 got $stripe_index"
+ +
         mkdir $DIR/$tdir/striped_dir/dir1 ||
                 error "create dir1 fails"
         stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir1)
         [ $stripe_index -eq 1 ] ||
- -              error "dir1 expect 1 got $stripe_index"
- -
- -      $LFS setdirstripe -i2 $DIR/$tdir/striped_dir/dir2 ||
- -              error "create dir2 fails"
- -      stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir2)
- -      [ $stripe_index -eq 2 ] ||
- -              error "dir2 expect 2 got $stripe_index"
+ +              error "dir1 expect index 1 got $stripe_index"
   
         #check default stripe count/stripe index
         test_300_check_default_striped_dir normal_dir $MDSCOUNT 1
diff --combined lustre/tests/sanityn.sh

index 29ba159,835e3b9..249844a
--- 1/lustre/tests/sanityn.sh
--- 2/lustre/tests/sanityn.sh
+++ b/lustre/tests/sanityn.sh
@@@ -48,6 -48,8 +48,8 @@@ TRACE=${TRACE:-""
   
   check_and_setup_lustre
   
+ OSC=${OSC:-"osc"}
+ 
   assert_DIR
   rm -rf $DIR1/[df][0-9]* $DIR1/lnk $DIR/[df].${TESTSUITE}*
   
@@@ -432,6 -434,8 +434,8 @@@ run_test 18 "mmap sanity check ========
   test_19() { # bug3811
         local node=$(facet_active_host ost1)
   
+       [ "x$DOM" = "xyes" ] && node=$(facet_active_host $SINGLEMDS)
+ 
         # check whether obdfilter is cache capable at all
         if ! get_osd_param $node '' read_cache_enable >/dev/null; then
                 echo "not cache-capable obdfilter"
@@@ -446,7 -450,7 +450,7 @@@
         cp $TMP/$tfile $DIR1/$tfile
         for i in `seq 1 20`; do
                 [ $((i % 5)) -eq 0 ] && log "$testname loop $i"
-               cancel_lru_locks osc > /dev/null
+               cancel_lru_locks $OSC > /dev/null
                 cksum $DIR1/$tfile | cut -d" " -f 1,2 > $TMP/sum1 & \
                 cksum $DIR2/$tfile | cut -d" " -f 1,2 > $TMP/sum2
                 wait
@@@ -462,12 -466,12 +466,12 @@@ run_test 19 "test concurrent uncached r
   
   test_20() {
         test_mkdir $DIR1/d20
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
         CNT=$((`lctl get_param -n llite.*.dump_page_cache | wc -l`))
         $MULTIOP $DIR1/f20 Ow8190c
         $MULTIOP $DIR2/f20 Oz8194w8190c
         $MULTIOP $DIR1/f20 Oz0r8190c
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
         CNTD=$((`lctl get_param -n llite.*.dump_page_cache | wc -l` - $CNT))
         [ $CNTD -gt 0 ] && \
             error $CNTD" page left in cache after lock cancel" || true
@@@ -498,7 -502,7 +502,7 @@@ test_23() { # Bug 597
         echo "atime should be updated while another read" > $DIR1/$tfile
   
         # clear the lock(mode: LCK_PW) gotten from creating operation
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
         time1=$(date +%s)
         echo "now is $time1"
         sleep $((at_diff + 1))
@@@ -530,9 -534,9 +534,9 @@@ test_24a() 
   
         OSC=`lctl dl | awk '/-osc-|OSC.*MNT/ {print $4}' | head -n 1`
   #     OSC=`lctl dl | awk '/-osc-/ {print $4}' | head -n 1`
-       lctl --device %$OSC deactivate
+       lctl --device %osc deactivate
         lfs df -i || error "lfs df -i with deactivated OSC failed"
-       lctl --device %$OSC activate
+       lctl --device %osc activate
         lfs df || error "lfs df with reactivated OSC failed"
   }
   run_test 24a "lfs df [-ih] [path] test ========================="
@@@ -622,7 -626,7 +626,7 @@@ test_26b() 
   run_test 26b "sync mtime between ost and mds"
   
   test_27() {
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
         lctl clear
         dd if=/dev/zero of=$DIR2/$tfile bs=$((4096+4))k conv=notrunc count=4 seek=3 &
         DD2_PID=$!
@@@ -679,7 -683,19 +683,7 @@@ test_28() { # bug 997
   }
   run_test 28 "read/write/truncate file with lost stripes"
   
- -test_29() { # bug 10999
- -      touch $DIR1/$tfile
- -      #define OBD_FAIL_LDLM_GLIMPSE  0x30f
- -      lctl set_param fail_loc=0x8000030f
- -      ls -l $DIR2/$tfile &
- -      usleep 500
- -      dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1
- -      wait
- -}
- -#bug 11549 - permanently turn test off in b1_5
- -run_test 29 "lock put race between glimpse and enqueue ========="
- -
- -test_30() { #bug #11110, LU-2523
+ +test_30() { #b=11110, LU-2523
         test_mkdir $DIR1/$tdir
         cp -f /bin/bash $DIR1/$tdir/bash
         /bin/sh -c 'sleep 1; rm -f $DIR2/$tdir/bash; cp /bin/bash $DIR2/$tdir' &
@@@ -688,6 -704,7 +692,6 @@@
         wait
         true
   }
- -
   run_test 30 "recreate file race"
   
   test_31a() {
@@@ -728,38 -745,39 +732,39 @@@ run_test 31b "voluntary OST cancel / bl
   
   # enable/disable lockless truncate feature, depending on the arg 0/1
   enable_lockless_truncate() {
-         lctl set_param -n osc.*.lockless_truncate $1
+       lctl set_param -n $OSC.*.lockless_truncate $1
   }
   
   test_32a() { # bug 11270
         local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
-       save_lustre_params client "osc.*.lockless_truncate" > $p
-       cancel_lru_locks osc
+ 
+       save_lustre_params client "$OSC.*.lockless_truncate" > $p
+       cancel_lru_locks $OSC
         enable_lockless_truncate 1
         rm -f $DIR1/$tfile
         lfs setstripe -c -1 $DIR1/$tfile
         dd if=/dev/zero of=$DIR1/$tfile count=$OSTCOUNT bs=$STRIPE_BYTES > \
                 /dev/null 2>&1
-       clear_stats osc.*.osc_stats
+       clear_stats $OSC.*.${OSC}_stats
   
         log "checking cached lockless truncate"
         $TRUNCATE $DIR1/$tfile 8000000
         $CHECKSTAT -s 8000000 $DIR2/$tfile || error "wrong file size"
-       [ $(calc_stats osc.*.osc_stats lockless_truncate) -ne 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -ne 0 ] ||
                 error "cached truncate isn't lockless"
   
         log "checking not cached lockless truncate"
         $TRUNCATE $DIR2/$tfile 5000000
         $CHECKSTAT -s 5000000 $DIR1/$tfile || error "wrong file size"
-       [ $(calc_stats osc.*.osc_stats lockless_truncate) -ne 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -ne 0 ] ||
                 error "not cached truncate isn't lockless"
   
         log "disabled lockless truncate"
         enable_lockless_truncate 0
-       clear_stats osc.*.osc_stats
+       clear_stats $OSC.*.${OSC}_stats
         $TRUNCATE $DIR2/$tfile 3000000
         $CHECKSTAT -s 3000000 $DIR1/$tfile || error "wrong file size"
-       [ $(calc_stats osc.*.osc_stats lockless_truncate) -eq 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -eq 0 ] ||
                 error "lockless truncate disabling failed"
         rm $DIR1/$tfile
         # restore lockless_truncate default values
@@@ -782,21 -800,21 +787,21 @@@ test_32b() { # bug 1127
                 "ldlm.namespaces.filter-*.contended_locks" >> $p
         save_lustre_params $facets \
                 "ldlm.namespaces.filter-*.contention_seconds" >> $p
-       clear_stats osc.*.osc_stats
+       clear_stats $OSC.*.${OSC}_stats
   
         # agressive lockless i/o settings
         do_nodes $(comma_list $(osts_nodes)) \
                 "lctl set_param -n ldlm.namespaces.*.max_nolock_bytes=2000000 \
                         ldlm.namespaces.filter-*.contended_locks=0 \
                         ldlm.namespaces.filter-*.contention_seconds=60"
-       lctl set_param -n osc.*.contention_seconds=60
+       lctl set_param -n $OSC.*.contention_seconds=60
         for i in {1..5}; do
                 dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > \
                         /dev/null 2>&1
                 dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > \
                         /dev/null 2>&1
         done
-       [ $(calc_stats osc.*.osc_stats lockless_write_bytes) -ne 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_write_bytes) -ne 0 ] ||
                 error "lockless i/o was not triggered"
         # disable lockless i/o (it is disabled by default)
         do_nodes $(comma_list $(osts_nodes)) \
@@@ -805,15 -823,15 +810,15 @@@
                         ldlm.namespaces.filter-*.contention_seconds=0"
         # set contention_seconds to 0 at client too, otherwise Lustre still
         # remembers lock contention
-       lctl set_param -n osc.*.contention_seconds=0
-       clear_stats osc.*.osc_stats
+       lctl set_param -n $OSC.*.contention_seconds=0
+       clear_stats $OSC.*.${OSC}_stats
         for i in {1..1}; do
                 dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > \
                         /dev/null 2>&1
                 dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > \
                         /dev/null 2>&1
         done
-       [ $(calc_stats osc.*.osc_stats lockless_write_bytes) -eq 0 ] ||
+       [ $(calc_stats $OSC.*.${OSC}_stats lockless_write_bytes) -eq 0 ] ||
                 error "lockless i/o works when disabled"
         rm -f $DIR1/$tfile
         restore_lustre_params <$p
@@@ -1367,7 -1385,7 +1372,7 @@@ test_39d() { # LU-731
   
         $LCTL set_param fail_loc=0
   
-       cancel_lru_locks osc
+       cancel_lru_locks $OSC
   
         local mtime2=$(stat -c %Y $DIR2/$tfile)
         [ "$mtime2" -ge "$d1" ] && [ "$mtime2" -le "$d2" ] ||
@@@ -3148,41 -3166,33 +3153,41 @@@ tbf_verify() 
         local client1=${CLIENT1:-$(hostname)}
         local myRUNAS="$3"
   
+ +      local np=$(check_cpt_number ost1)
+ +      [ $np -gt 0 ] || error "CPU partitions should not be $np."
+ +      echo "cpu_npartitions on ost1 is $np"
+ +
         mkdir $dir || error "mkdir $dir failed"
- -      $LFS setstripe -c 1 $dir || error "setstripe to $dir failed"
+ +      $LFS setstripe -c 1 -i 0 $dir || error "setstripe to $dir failed"
         chmod 777 $dir
   
         trap cleanup_tbf_verify EXIT
         echo "Limited write rate: $1, read rate: $2"
         echo "Verify the write rate is under TBF control"
- -      local runtime=$(do_node $client1 $myRUNAS dd if=/dev/zero of=$dir/tbf \
- -              bs=1M count=100 oflag=direct 2>&1 | awk '/bytes/ {print $6}')
+ +      local start=$SECONDS
+ +      do_node $client1 $myRUNAS dd if=/dev/zero of=$dir/tbf \
+ +              bs=1M count=100 oflag=direct 2>&1
+ +      local runtime=$((SECONDS - start + 1))
         local rate=$(bc <<< "scale=6; 100 / $runtime")
         echo "Write runtime is $runtime s, speed is $rate IOPS"
   
- -      # verify the write rate does not exceed 110% of TBF limited rate
- -      [ $(bc <<< "$rate < 1.1 * $1") -eq 1 ] ||
- -              error "The write rate ($rate) exceeds 110% of preset rate ($1)"
+ +      # verify the write rate does not exceed TBF rate limit
+ +      [ $(bc <<< "$rate < 1.1 * $np * $1") -eq 1 ] ||
+ +              error "The write rate ($rate) exceeds 110% of rate limit ($1 * $np)"
   
         cancel_lru_locks osc
   
         echo "Verify the read rate is under TBF control"
- -      runtime=$(do_node $client1 $myRUNAS dd if=$dir/tbf of=/dev/null \
- -              bs=1M count=100 iflag=direct 2>&1 | awk '/bytes/ {print $6}')
+ +      start=$SECONDS
+ +      do_node $client1 $myRUNAS dd if=$dir/tbf of=/dev/null \
+ +              bs=1M count=100 iflag=direct 2>&1
+ +      runtime=$((SECONDS - start + 1))
         rate=$(bc <<< "scale=6; 100 / $runtime")
         echo "Read runtime is $runtime s, speed is $rate IOPS"
   
- -      # verify the read rate does not exceed 110% of TBF limited rate
- -      [ $(bc <<< "$rate < 1.1 * $2") -eq 1 ] ||
- -              error "The read rate ($rate) exceeds 110% of preset rate ($2)"
+ +      # verify the read rate does not exceed TBF rate limit
+ +      [ $(bc <<< "$rate < 1.1 * $np * $2") -eq 1 ] ||
+ +              error "The read rate ($rate) exceeds 110% of rate limit ($2 * $np)"
   
         cancel_lru_locks osc
         cleanup_tbf_verify || error "rm -rf $dir failed"
@@@ -4002,6 -4012,142 +4007,142 @@@ test_93() 
   }
   run_test 93 "alloc_rr should not allocate on same ost"
   
+ # Data-on-MDT tests
+ test_100a() {
+       skip "Reserved for glimpse-ahead" && return
+       mkdir -p $DIR/$tdir
+       # composite file: 1024K 'mdt' component, then a second one up to EOF
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+ 
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=4096 count=1 || return 1
+ 
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #1"
+       # first stat from server should return size data and save glimpse
+       local gls=$(lctl get_param -n mdc.*.stats | \
+               awk '/ldlm_glimpse/ {print $2}')
+       [ -z "$gls" ] || error "Unexpected $gls glimpse RPCs"
+       # second stat to check size is NOT cached on client without IO lock
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #2"
+       # count the stats lines that mention glimpse; exactly one is expected
+       gls=$(lctl get_param -n mdc.*.stats | grep ldlm_glimpse | wc -l)
+       [ "1" == "$gls" ] || error "Expect 1 glimpse RPCs but got $gls"
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100a "DoM: glimpse RPCs for stat without IO lock (DoM only file)"
+ 
+ test_100b() {
+       mkdir -p $DIR/$tdir
+       # composite file: 1024K 'mdt' component, then a second one up to EOF
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+ 
+       lctl set_param -n mdc.*.stats=clear
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=4096 count=1 || return 1
+       cancel_lru_locks mdc
+       # first stat data from server should have size
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #1"
+       # second stat to check size is cached on client
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #2"
+ 
+       local gls=$(lctl get_param -n mdc.*.stats |
+                       awk '/ldlm_glimpse/ {print $2}')
+       # both stats should cause no glimpse requests
+       [ -z "$gls" ] || error "Unexpected $gls glimpse RPCs"
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100b "DoM: no glimpse RPC for stat with IO lock (DoM only file)"
+ 
+ test_100c() {
+       mkdir -p $DIR/$tdir
+       # composite file: 1024K 'mdt' component, then a second one up to EOF
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+ 
+       lctl set_param -n mdc.*.stats=clear
+       lctl set_param -n osc.*.stats=clear
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=2048K count=1 || return 1
+ 
+       # check that size is merged from MDT and OST correctly
+       $CHECKSTAT -t file -s 2097152 $DIR/$tdir/dom ||
+               error "Wrong size from stat #1"
+ 
+       local gls=$(lctl get_param -n osc.*.stats | grep ldlm_glimpse | wc -l)
+       [ $gls -eq 0 ] && error "Expect OST glimpse RPCs but got none"
+ 
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100c "DoM: write vs stat without IO lock (combined file)"
+ 
+ test_100d() {
+       mkdir -p $DIR/$tdir
+ 
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+ 
+       dd if=/dev/zero of=$DIR2/$tdir/dom bs=2048K count=1 || return 1
+       lctl set_param -n mdc.*.stats=clear
+       lctl set_param -n osc.*.stats=clear
+       $TRUNCATE $DIR2/$tdir/dom 4096
+ 
+       # check that reported size is valid after file grows to OST and
+       # is truncated back to MDT stripe size
+       $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom ||
+               error "Wrong size from stat #1"
+ 
+       local gls=$(lctl get_param -n osc.*.stats | grep ldlm_glimpse | wc -l)
+       [ $gls -eq 0 ] && error "Expect OST glimpse but got none"
+ 
+       rm -f $DIR/$tdir/dom
+ }
+ run_test 100d "DoM: write+truncate vs stat without IO lock (combined file)"
+ 
+ 
+ test_101a() {
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+       lctl set_param -n mdc.*.stats=clear
+       # stat the file first so that the layout is fetched
+       $CHECKSTAT -t file $DIR1/$tfile
+       # write through the first mount: open + IO lock
+       dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+       # unlink through the second mount: pages must be discarded
+       rm $DIR2/$tfile || error "Unlink fails"
+       local wr_rpcs=$(lctl get_param -n mdc.*.stats | grep -c ost_write)
+       [ $wr_rpcs -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101a "Discard DoM data on unlink"
+ 
+ test_101b() {
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+       touch $DIR1/${tfile}_2
+       lctl set_param -n mdc.*.stats=clear
+       # stat the file first so that the layout is fetched
+       $CHECKSTAT -t file $DIR1/$tfile
+       # write through the first mount: open + IO lock
+       dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+       # rename over the file through the second mount: pages must be discarded
+       mv $DIR2/${tfile}_2 $DIR2/$tfile || error "Rename fails"
+       local wr_rpcs=$(lctl get_param -n mdc.*.stats | grep -c ost_write)
+       [ $wr_rpcs -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101b "Discard DoM data on rename"
+ 
+ test_101c() {
+       $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+       lctl set_param -n mdc.*.stats=clear
+       # stat the file first so that the layout is fetched
+       $CHECKSTAT -t file $DIR1/$tfile
+       # write through the first mount: open + IO lock
+       dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+ 
+       $MULTIOP $DIR1/$tfile O_c &
+       MULTIOP_PID=$!
+       sleep 2
+       rm $DIR2/$tfile > /dev/null || error "Unlink fails"
+       kill -USR1 $MULTIOP_PID || return 2
+       wait $MULTIOP_PID || return 3
+       local wr_rpcs=$(lctl get_param -n mdc.*.stats | grep -c ost_write)
+       [ $wr_rpcs -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101c "Discard DoM data on close-unlink"
+ 
   log "cleanup: ======================================================"
   
   # kill and wait in each test only guarentee script finish, but command in script
diff --combined lustre/tests/test-framework.sh

index 8774f8e,e8ecdfa..34d8580
--- 1/lustre/tests/test-framework.sh
--- 2/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@@ -611,18 -611,18 +611,18 @@@ load_modules_local() 
                                 LNETLND="socklnd/ksocklnd"
                 esac
         fi
-     load_module ../lnet/klnds/$LNETLND
-     load_module obdclass/obdclass
-     load_module ptlrpc/ptlrpc
-     load_module ptlrpc/gss/ptlrpc_gss
-     load_module fld/fld
-     load_module fid/fid
-     load_module lmv/lmv
-     load_module mdc/mdc
-     load_module osc/osc
-     load_module lov/lov
-     load_module mgc/mgc
-     load_module obdecho/obdecho
+       load_module ../lnet/klnds/$LNETLND
+       load_module obdclass/obdclass
+       load_module ptlrpc/ptlrpc
+       load_module ptlrpc/gss/ptlrpc_gss
+       load_module fld/fld
+       load_module fid/fid
+       load_module lmv/lmv
+       load_module osc/osc
+       load_module mdc/mdc
+       load_module lov/lov
+       load_module mgc/mgc
+       load_module obdecho/obdecho
         if ! client_only; then
                 SYMLIST=/proc/kallsyms
                 grep -q crc16 $SYMLIST ||
@@@ -2501,7 -2501,7 +2501,7 @@@ wait_update_facet() 
   
   sync_all_data() {
         do_nodes $(comma_list $(mdts_nodes)) \
- -          "lctl set_param -n osd*.*MDT*.force_sync=1"
+ +          "lctl set_param -n os[cd]*.*MDT*.force_sync=1"
         do_nodes $(comma_list $(osts_nodes)) \
             "lctl set_param -n osd*.*OS*.force_sync=1" 2>&1 |
                 grep -v 'Found no match'
@@@ -2542,7 -2542,7 +2542,7 @@@ wait_delete_completed_mds() 
         mds2sync=$(comma_list $mds2sync)
   
         # sync MDS transactions
- -      do_nodes $mds2sync "$LCTL set_param -n osd*.*MD*.force_sync 1"
+ +      do_nodes $mds2sync "$LCTL set_param -n os[cd]*.*MD*.force_sync 1"
   
         # wait till all changes are sent and commmitted by OSTs
         # for ldiskfs space is released upon execution, but DMU
@@@ -3969,7 -3969,7 +3969,7 @@@ format_ost() 
   }
   
   formatall() {
- -      stopall
+ +      stopall -f
         # Set hostid for ZFS/SPL zpool import protection
         # (Assumes MDS version is also OSS version)
         if [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.8.54) ];
diff --combined lustre/utils/lfs.c

index be6a2d3,515f169..026e36e
--- 1/lustre/utils/lfs.c
--- 2/lustre/utils/lfs.c
+++ b/lustre/utils/lfs.c
@@@ -113,32 -113,38 +113,34 @@@ static int lfs_list_commands(int argc, 
   
   /* Setstripe and migrate share mostly the same parameters */
   #define SSM_CMD_COMMON(cmd) \
- -      "usage: "cmd" [--stripe-count|-c <stripe_count>]\n"             \
+ +      "usage: "cmd" [--component-end|-E <comp_end>]\n"                \
+ +      "                 [--stripe-count|-c <stripe_count>]\n"         \
         "                 [--stripe-index|-i <start_ost_idx>]\n"        \
         "                 [--stripe-size|-S <stripe_size>]\n"           \
+       "                 [--layout|-L <pattern>]\n"            \
         "                 [--pool|-p <pool_name>]\n"                    \
- -      "                 [--ost|-o <ost_indices>]\n"                   \
- -      "                 [--component-end|-E <comp_end>]\n"
+ +      "                 [--ost|-o <ost_indices>]\n"
   
   #define SSM_HELP_COMMON \
- -      "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n" \
- -      "\t              Can be specified with k, m or g (in KB, MB and GB\n" \
+ +      "\tstripe_count: Number of OSTs to stripe over (0=fs default, -1 all)\n" \
+ +      "\tstart_ost_idx: OST index of first stripe (-1=default round robin)\n"\
+ +      "\tstripe_size:  Number of bytes on each OST (0=fs default)\n" \
+ +      "\t              Can be specified with K, M or G (for KB, MB, GB\n" \
         "\t              respectively)\n"                               \
- -      "\tstart_ost_idx: OST index of first stripe (-1 default)\n"     \
- -      "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n" \
         "\tpool_name:    Name of OST pool to use (default none)\n"      \
+       "\tlayout:       stripe pattern type: raid0, mdt (default raid0)\n"\
         "\tost_indices:  List of OST indices, can be repeated multiple times\n"\
         "\t              Indices be specified in a format of:\n"        \
         "\t                -o <ost_1>,<ost_i>-<ost_j>,<ost_n>\n"        \
         "\t              Or:\n"                                         \
         "\t                -o <ost_1> -o <ost_i>-<ost_j> -o <ost_n>\n"  \
         "\t              If --pool is set with --ost, then the OSTs\n"  \
- -      "\t              must be the members of the pool."              \
- -      "\tcomp_end:     Extent end of the component\n"                 \
- -      "\t              Can be specified with k, m or g (in KB, MB and GB\n" \
- -      "\t              respectively, -1 for EOF), it must be aligned with\n"\
- -      "\t              the stripe_size\n"
+ +      "\t              must be the members of the pool.\n"            \
+ +      "\tcomp_end:     Extent end of component, start after previous end.\n"\
+ +      "\t              Can be specified with K, M or G (for KB, MB, GB\n" \
+ +      "\t              respectively, -1 for EOF). Must be a multiple of\n"\
+ +      "\t              stripe_size.\n"
   
- -#define SETSTRIPE_USAGE                                               \
- -      SSM_CMD_COMMON("setstripe")                             \
- -      "                 <directory|filename>\n"               \
- -      SSM_HELP_COMMON                                         \
   
   #define MIGRATE_USAGE                                                 \
         SSM_CMD_COMMON("migrate  ")                                     \
@@@ -169,26 -175,28 +171,26 @@@ static bool              file_lease_supported = tr
   /* all available commands */
   command_t cmdlist[] = {
         {"setstripe", lfs_setstripe, 0,
- -       "Create a new file with a specific striping pattern or\n"
- -       "set the default striping pattern on an existing directory or\n"
- -       "delete the default striping pattern from an existing directory or\n"
- -       "add layout component(s) to an existing composite file or\n"
- -       "delete specified component(s) from an existing composite file\n\n"
- -       "To delete default striping from an existing directory:\n"
+ +       "To create a file with specified striping/composite layout, or\n"
+ +       "create/replace the default layout on an existing directory:\n"
+ +       SSM_CMD_COMMON("setstripe")
+ +       "                 <directory|filename>\n"
+ +       " or\n"
+ +       "To add component(s) to an existing composite file:\n"
+ +       SSM_CMD_COMMON("setstripe --component-add")
+ +       SSM_HELP_COMMON
+ +       "To totally delete the default striping from an existing directory:\n"
          "usage: setstripe -d <directory>\n"
          " or\n"
- -       "To delete component(s) from an existing composite file:\n"
+ +       "To delete the last component(s) from an existing composite file\n"
+ +       "(note that this will also delete any data in those components):\n"
          "usage: setstripe --component-del [--component-id|-I <comp_id>]\n"
          "                               [--component-flags|-F <comp_flags>]\n"
          "                               <filename>\n"
- -       "\tcomp_id:     Unique component ID\n"
+ +       "\tcomp_id:     Unique component ID to delete\n"
          "\tcomp_flags:  'init' indicating all instantiated components\n"
- -       "\t             '^init' indicating all uninstantiated components\n"
- -       "\t-I and -F can't be specified at the same time\n"
- -       " or\n"
- -       "To add component(s) to an existing composite file:\n"
- -       SSM_CMD_COMMON("setstripe --component-add")
- -       " or\n"
- -       "To create a file with specified striping/composite layout:\n"
- -       SETSTRIPE_USAGE},
+ +       "\t             '^init' indicating all uninstantiated components\n"
+ +       "\t-I and -F cannot be specified at the same time\n"},
         {"getstripe", lfs_getstripe, 0,
          "To list the striping info for a given file or files in a\n"
          "directory or recursively for all files in a directory tree.\n"
@@@ -243,7 -251,7 +245,7 @@@
            "     [[!] --gid|-g|--group|-G <gid>|<gname>]\n"
            "     [[!] --uid|-u|--user|-U <uid>|<uname>] [[!] --pool <pool>]\n"
          "     [[!] --projid <projid>]\n"
-        "     [[!] --layout|-L released,raid0]\n"
+        "     [[!] --layout|-L released,raid0,mdt]\n"
          "     [[!] --component-count [+-]<comp_cnt>]\n"
          "     [[!] --component-start [+-]N[kMGTPE]]\n"
          "     [[!] --component-end|-E [+-]N[kMGTPE]]\n"
@@@ -721,10 -729,10 +723,10 @@@ static int lfs_component_create(char *f
   
         fd = llapi_layout_file_open(fname, open_flags, open_mode, layout);
         if (fd < 0)
- -              fprintf(stderr, "%s %s failed. %s\n",
+ +              fprintf(stderr, "%s: cannot %s '%s': %s\n", progname,
                         S_ISDIR(st.st_mode) ?
- -                              "Set default composite layout to " :
- -                              "Create composite file",
+ +                              "set default composite layout for" :
+ +                              "create composite file",
                         fname, strerror(errno));
         return fd;
   }
@@@ -777,7 -785,7 +779,7 @@@ static int lfs_migrate(char *name, __u6
         fd = open(name, O_RDWR | O_DIRECT);
         if (fd == -1) {
                 rc = -errno;
- -              fprintf(stderr, "%s: %s: cannot open: %s\n", progname, name,
+ +              fprintf(stderr, "%s: cannot open '%s': %s\n", progname, name,
                         strerror(-rc));
                 goto free;
         }
@@@ -1011,6 -1019,7 +1013,7 @@@ struct lfs_setstripe_args 
         int                      lsa_stripe_off;
         __u32                    lsa_comp_flags;
         int                      lsa_nr_osts;
+       int                      lsa_pattern;
         __u32                   *lsa_osts;
         char                    *lsa_pool_name;
   };
@@@ -1025,7 -1034,7 +1028,7 @@@ static inline bool setstripe_args_speci
   {
         return (lsa->lsa_stripe_size != 0 || lsa->lsa_stripe_count != 0 ||
                 lsa->lsa_stripe_off != -1 || lsa->lsa_pool_name != NULL ||
-               lsa->lsa_comp_end != 0);
+               lsa->lsa_comp_end != 0 || lsa->lsa_pattern != 0);
   }
   
   static int comp_args_to_layout(struct llapi_layout **composite,
@@@ -1070,6 -1079,51 +1073,51 @@@
                 return rc;
         }
   
+       /* Data-on-MDT component setting */
+       if (lsa->lsa_pattern == LLAPI_LAYOUT_MDT) {
+               /* In case of Data-on-MDT patterns the only extra option
+                * applicable is stripe size option. */
+               if (lsa->lsa_stripe_count) {
+                       fprintf(stderr, "Option 'stripe-count' can't be "
+                               "specified with Data-on-MDT component: %i\n",
+                               lsa->lsa_stripe_count);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_stripe_size) {
+                       fprintf(stderr, "Option 'stripe-size' can't be "
+                               "specified with Data-on-MDT component: %llu\n",
+                               lsa->lsa_stripe_size);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_nr_osts != 0) {
+                       fprintf(stderr, "Option 'ost-list' can't be specified "
+                               "with Data-on-MDT component: '%i'\n",
+                               lsa->lsa_nr_osts);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_stripe_off != -1) {
+                       fprintf(stderr, "Option 'stripe-offset' can't be "
+                               "specified with Data-on-MDT component: %i\n",
+                               lsa->lsa_stripe_off);
+                       return -EINVAL;
+               }
+               if (lsa->lsa_pool_name != 0) {
+                       fprintf(stderr, "Option 'pool' can't be specified "
+                               "with Data-on-MDT component: '%s'\n",
+                               lsa->lsa_pool_name);
+                       return -EINVAL;
+               }
+ 
+               rc = llapi_layout_pattern_set(layout, lsa->lsa_pattern);
+               if (rc) {
+                       fprintf(stderr, "Set stripe pattern %#x failed. %s\n",
+                               lsa->lsa_pattern, strerror(errno));
+                       return rc;
+               }
+               /* Data-on-MDT component has always single stripe up to end */
+               lsa->lsa_stripe_size = lsa->lsa_comp_end;
+       }
+ 
         if (lsa->lsa_stripe_size != 0) {
                 rc = llapi_layout_stripe_size_set(layout,
                                                   lsa->lsa_stripe_size);
@@@ -1253,9 -1307,8 +1301,9 @@@ static int comp_str2flags(__u32 *flags
                         }
                 }
                 if (!found) {
- -                      llapi_printf(LLAPI_MSG_ERROR, "Component flag "
- -                                   "'%s' is not supported.\n", name);
+ +                      llapi_printf(LLAPI_MSG_ERROR,
+ +                                   "%s: component flag '%s' not supported\n",
+ +                                   progname, name);
                         return -EINVAL;
                 }
         }
@@@ -1341,6 -1394,12 +1389,6 @@@ static int lfs_setstripe(int argc, cha
         { .val = LFS_COMP_SET_OPT,
                         .name = "component-set",
                                                 .has_arg = no_argument},
- -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- -      /* This formerly implied "stripe-count", but was explicitly
- -       * made "stripe-count" for consistency with other options,
- -       * and to separate it from "mdt-count" when DNE arrives. */
- -      { .val = 'c',   .name = "count",        .has_arg = required_argument },
- -#endif
         { .val = 'c',   .name = "stripe-count", .has_arg = required_argument},
         { .val = 'c',   .name = "stripe_count", .has_arg = required_argument},
         { .val = 'd',   .name = "delete",       .has_arg = no_argument},
@@@ -1348,10 -1407,17 +1396,11 @@@
         { .val = 'E',   .name = "component-end",
                                                 .has_arg = required_argument},
         /* dirstripe {"mdt-hash",     required_argument, 0, 'H'}, */
- -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- -      /* This formerly implied "stripe-index", but was explicitly
- -       * made "stripe-index" for consistency with other options,
- -       * and to separate it from "mdt-index" when DNE arrives. */
- -      { .val = 'i',   .name = "index",        .has_arg = required_argument },
- -#endif
         { .val = 'i',   .name = "stripe-index", .has_arg = required_argument},
         { .val = 'i',   .name = "stripe_index", .has_arg = required_argument},
         { .val = 'I',   .name = "comp-id",      .has_arg = required_argument},
         { .val = 'I',   .name = "component-id", .has_arg = required_argument},
+       { .val = 'L',   .name = "layout",       .has_arg = required_argument },
         { .val = 'm',   .name = "mdt",          .has_arg = required_argument},
         { .val = 'm',   .name = "mdt-index",    .has_arg = required_argument},
         { .val = 'm',   .name = "mdt_index",    .has_arg = required_argument},
@@@ -1363,6 -1429,12 +1412,6 @@@
         { .val = 'o',   .name = "ost_list",     .has_arg = required_argument },
   #endif
         { .val = 'p',   .name = "pool",         .has_arg = required_argument },
- -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- -      /* This formerly implied "--stripe-size", but was confusing
- -       * with "lfs find --size|-s", which means "file size", so use
- -       * the consistent "--stripe-size|-S" for all commands. */
- -      { .val = 's',   .name = "size",         .has_arg = required_argument },
- -#endif
         { .val = 'S',   .name = "stripe-size",  .has_arg = required_argument },
         { .val = 'S',   .name = "stripe_size",  .has_arg = required_argument },
         /* dirstripe {"mdt-count",    required_argument, 0, 'T'}, */
@@@ -1387,7 -1459,7 +1436,7 @@@
         if (strcmp(argv[0], "migrate") == 0)
                 migrate_mode = true;
   
-       while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:s:S:v",
+       while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:L:s:S:v",
                                 long_opts, NULL)) >= 0) {
                 switch (c) {
                 case 0:
@@@ -1401,28 -1473,34 +1450,28 @@@
                         break;
                 case LFS_COMP_FLAGS_OPT:
                         result = comp_str2flags(&lsa.lsa_comp_flags, optarg);
- -                      if (result != 0) {
- -                              fprintf(stderr, "error: %s: bad comp flags "
- -                                      "'%s'\n", argv[0], optarg);
- -                              goto error;
- -                      }
+ +                      if (result != 0)
+ +                              goto usage_error;
                         break;
                 case LFS_COMP_SET_OPT:
                         comp_set = 1;
                         break;
                 case 'b':
                         if (!migrate_mode) {
- -                              fprintf(stderr, "--block is valid only for"
- -                                              " migrate mode\n");
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: -b|--block valid only for migrate command\n",
+ +                                      progname, argv[0]);
+ +                              goto usage_error;
                         }
                         migration_block = true;
                         break;
                 case 'c':
- -#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 6, 53, 0)
- -                      if (strcmp(argv[optind - 1], "--count") == 0)
- -                              fprintf(stderr, "warning: '--count' deprecated"
- -                                      ", use '--stripe-count' instead\n");
- -#endif
                         lsa.lsa_stripe_count = strtoul(optarg, &end, 0);
                         if (*end != '\0') {
- -                              fprintf(stderr, "error: %s: bad stripe count "
- -                                      "'%s'\n", argv[0], optarg);
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: invalid stripe count '%s'\n",
+ +                                      progname, argv[0], optarg);
+ +                              goto usage_error;
                         }
                         break;
                 case 'd':
@@@ -1432,12 -1510,8 +1481,12 @@@
                 case 'E':
                         if (lsa.lsa_comp_end != 0) {
                                 result = comp_args_to_layout(&layout, &lsa);
- -                              if (result)
- -                                      goto error;
+ +                              if (result) {
+ +                                      fprintf(stderr,
+ +                                              "%s %s: invalid layout\n",
+ +                                              progname, argv[0]);
+ +                                      goto usage_error;
+ +                              }
   
                                 setstripe_args_init(&lsa);
                         }
@@@ -1449,47 -1523,70 +1498,71 @@@
                                                         &lsa.lsa_comp_end,
                                                         &size_units, 0);
                                 if (result) {
- -                                      fprintf(stderr, "error: %s: "
- -                                              "bad component end '%s'\n",
- -                                              argv[0], optarg);
- -                                      goto error;
+ +                                      fprintf(stderr,
+ +                                              "%s %s: invalid component end '%s'\n",
+ +                                              progname, argv[0], optarg);
+ +                                      goto usage_error;
                                 }
                         }
                         break;
                 case 'i':
- -                      if (strcmp(argv[optind - 1], "--index") == 0)
- -                              fprintf(stderr, "warning: '--index' deprecated"
- -                                      ", use '--stripe-index' instead\n");
                         lsa.lsa_stripe_off = strtol(optarg, &end, 0);
                         if (*end != '\0') {
- -                              fprintf(stderr, "error: %s: bad stripe offset "
- -                                      "'%s'\n", argv[0], optarg);
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: invalid stripe offset '%s'\n",
+ +                                      progname, argv[0], optarg);
+ +                              goto usage_error;
                         }
                         break;
                 case 'I':
                         comp_id = strtoul(optarg, &end, 0);
                         if (*end != '\0' || comp_id == 0 ||
                             comp_id > LCME_ID_MAX) {
- -                              fprintf(stderr, "error: %s: bad comp ID "
- -                                      "'%s'\n", argv[0], optarg);
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: invalid component ID '%s'\n",
+ +                                      progname, argv[0], optarg);
+ +                              goto usage_error;
                         }
                         break;
+               case 'L':
+                       if (strcmp(argv[optind - 1], "mdt") == 0) {
+                               /* Can be only the first component */
+                               if (layout != NULL) {
+                                       result = -EINVAL;
+                                       fprintf(stderr, "error: 'mdt' layout "
+                                               "can be only the first one\n");
+                                       goto error;
+                               }
+                               if (lsa.lsa_comp_end > (1ULL << 30)) { /* 1 GiB */
+                                       result = -EFBIG;
+                                       fprintf(stderr, "error: 'mdt' layout "
+                                               "size is too big\n");
+                                       goto error;
+                               }
+                               lsa.lsa_pattern = LLAPI_LAYOUT_MDT;
+                       } else if (strcmp(argv[optind - 1], "raid0") != 0) {
+                               result = -EINVAL;
+                               fprintf(stderr, "error: layout '%s' is "
+                                       "unknown, supported layouts are: "
+                                       "'mdt', 'raid0'\n", argv[optind]);
+                               goto error;
+                       }
+                       break;
                 case 'm':
                         if (!migrate_mode) {
- -                              fprintf(stderr, "--mdt-index is valid only for"
- -                                              " migrate mode\n");
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: -m|--mdt-index valid only for migrate command\n",
+ +                                      progname, argv[0]);
+ +                              goto usage_error;
                         }
                         mdt_idx_arg = optarg;
                         break;
                 case 'n':
                         if (!migrate_mode) {
- -                              fprintf(stderr, "--non-block is valid only for"
- -                                              " migrate mode\n");
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: -n|--non-block valid only for migrate command\n",
+ +                                      progname, argv[0]);
+ +                              goto usage_error;
                         }
                         migration_flags |= MIGRATION_NONBLOCK;
                         break;
@@@ -1499,9 -1596,9 +1572,9 @@@
                                                 lsa.lsa_nr_osts, optarg);
                         if (lsa.lsa_nr_osts < 0) {
                                 fprintf(stderr,
- -                                      "error: %s: bad OST indices '%s'\n",
- -                                      argv[0], optarg);
- -                              goto error;
+ +                                      "%s %s: invalid OST target(s) '%s'\n",
+ +                                      progname, argv[0], optarg);
+ +                              goto usage_error;
                         }
   
                         lsa.lsa_osts = osts;
@@@ -1510,32 -1607,35 +1583,32 @@@
                         break;
                 case 'p':
                         if (optarg == NULL)
- -                              goto error;
+ +                              goto usage_error;
                         lsa.lsa_pool_name = optarg;
                         break;
- -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- -              case 's':
- -#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 6, 53, 0)
- -                      fprintf(stderr, "warning: '--size|-s' deprecated, "
- -                              "use '--stripe-size|-S' instead\n");
- -#endif
- -#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0) */
                 case 'S':
                         result = llapi_parse_size(optarg, &lsa.lsa_stripe_size,
                                                   &size_units, 0);
                         if (result) {
- -                              fprintf(stderr, "error: %s: bad stripe size "
- -                                      "'%s'\n", argv[0], optarg);
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: invalid stripe size '%s'\n",
+ +                                      progname, argv[0], optarg);
+ +                              goto usage_error;
                         }
                         break;
                 case 'v':
                         if (!migrate_mode) {
- -                              fprintf(stderr, "--verbose is valid only for"
- -                                              " migrate mode\n");
- -                              goto error;
+ +                              fprintf(stderr,
+ +                                      "%s %s: -v|--verbose valid only for migrate command\n",
+ +                                      progname, argv[0]);
+ +                              goto usage_error;
                         }
                         migrate_mdt_param.fp_verbose = VERBOSE_DETAIL;
                         break;
                 default:
- -                      goto error;
+ +                      fprintf(stderr, "%s %s: unrecognized option '%s'\n",
+ +                              progname, argv[0], argv[optind - 1]);
+ +                      goto usage_error;
                 }
         }
   
@@@ -1543,56 -1643,52 +1616,56 @@@
   
         if (lsa.lsa_comp_end != 0) {
                 result = comp_args_to_layout(&layout, &lsa);
- -              if (result)
- -                      goto error;
+ +              if (result) {
+ +                      fprintf(stderr, "%s %s: invalid component layout\n",
+ +                              progname, argv[0]);
+ +                      goto usage_error;
+ +              }
         }
   
         if (optind == argc) {
- -              fprintf(stderr, "error: %s: missing filename|dirname\n",
- -                      argv[0]);
- -              goto error;
+ +              fprintf(stderr, "%s %s: FILE must be specified\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         /* Only LCME_FL_INIT flags is used in PFL, and it shouldn't be
          * altered by user space tool, so we don't need to support the
          * --component-set for this moment. */
         if (comp_set != 0) {
- -              fprintf(stderr, "error: %s: --component-set isn't supported.\n",
- -                      argv[0]);
- -              goto error;
+ +              fprintf(stderr, "%s %s: --component-set not supported\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if ((delete + comp_set + comp_del + comp_add) > 1) {
- -              fprintf(stderr, "error: %s: can't specify --component-set, "
- -                      "--component-del, --component-add or -d together\n",
- -                      argv[0]);
- -              goto error;
+ +              fprintf(stderr,
+ +                      "%s %s: options --component-set, --component-del, --component-add and -d are mutually exclusive\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if (delete && (setstripe_args_specified(&lsa) || comp_id != 0 ||
                        lsa.lsa_comp_flags != 0 || layout != NULL)) {
- -              fprintf(stderr, "error: %s: can't specify -d with "
- -                      "-s, -c, -o, -p, -I, -F or -E options\n",
- -                      argv[0]);
- -              goto error;
+ +              fprintf(stderr,
+ +                      "%s %s: option -d is mutually exclusive with -s, -c, -o, -p, -I, -F and -E options\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if ((comp_set || comp_del) &&
             (setstripe_args_specified(&lsa) || layout != NULL)) {
- -              fprintf(stderr, "error: %s: can't specify --component-del or "
- -                      "--component-set with -s, -c, -o, -p or -E options.\n",
- -                      argv[0]);
- -              goto error;
+ +              fprintf(stderr,
+ +                      "%s %s: options --component-del and --component-set are mutually exclusive when used with -c, -E, -o, -p, or -s\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if (comp_del && comp_id != 0 && lsa.lsa_comp_flags != 0) {
- -              fprintf(stderr, "error: %s: can't specify both -I and -F for "
- -                      "--component-del option.\n", argv[0]);
- -              goto error;
+ +              fprintf(stderr,
+ +                      "%s %s: options -I and -F are mutually exclusive when used with --component-del\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if (comp_add || comp_del) {
@@@ -1600,19 -1696,18 +1673,19 @@@
   
                 result = lstat(fname, &st);
                 if (result == 0 && S_ISDIR(st.st_mode)) {
- -                      fprintf(stderr, "error: %s: can't use --component-add "
- -                              "or --component-del for directory.\n",
- -                              argv[0]);
- -                      goto error;
+ +                      fprintf(stderr,
+ +                              "%s setstripe: cannot use --component-add or --component-del for directory\n",
+ +                              progname);
+ +                      goto usage_error;
                 }
         }
   
         if (comp_add) {
                 if (layout == NULL) {
- -                      fprintf(stderr, "error: %s: -E option must be present"
- -                              "in --component-add mode.\n", argv[0]);
- -                      goto error;
+ +                      fprintf(stderr,
+ +                              "%s %s: option -E must be specified with --component-add\n",
+ +                              progname, argv[0]);
+ +                      goto usage_error;
                 }
                 result = adjust_first_extent(fname, layout);
                 if (result == -ENODATA)
@@@ -1622,33 -1717,31 +1695,33 @@@
         }
   
         if (mdt_idx_arg != NULL && optind > 3) {
- -              fprintf(stderr, "error: %s: cannot specify -m with other "
- -                      "options\n", argv[0]);
- -              goto error;
+ +              fprintf(stderr,
+ +                      "%s %s: option -m cannot be used with other options\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if ((migration_flags & MIGRATION_NONBLOCK) && migration_block) {
                 fprintf(stderr,
- -                      "error: %s: cannot specify --non-block and --block\n",
- -                      argv[0]);
- -              goto error;
+ +                      "%s %s: options --non-block and --block are mutually exclusive\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if (!comp_del && !comp_set && comp_id != 0) {
- -              fprintf(stderr, "error: %s: -I can only be used with "
- -                      "--component-del.\n", argv[0]);
- -              goto error;
+ +              fprintf(stderr,
+ +                      "%s %s: option -I can only be used with --component-del\n",
+ +                      progname, argv[0]);
+ +              goto usage_error;
         }
   
         if (mdt_idx_arg != NULL) {
                 /* initialize migrate mdt parameters */
                 migrate_mdt_param.fp_mdt_index = strtoul(mdt_idx_arg, &end, 0);
                 if (*end != '\0') {
- -                      fprintf(stderr, "error: %s: bad MDT index '%s'\n",
- -                              argv[0], mdt_idx_arg);
- -                      goto error;
+ +                      fprintf(stderr, "%s %s: invalid MDT index '%s'\n",
+ +                              progname, argv[0], mdt_idx_arg);
+ +                      goto usage_error;
                 }
                 migrate_mdt_param.fp_migrate = 1;
         } else if (layout == NULL) {
@@@ -1656,28 -1749,25 +1729,27 @@@
                 param = calloc(1, offsetof(typeof(*param),
                                lsp_osts[lsa.lsa_nr_osts]));
                 if (param == NULL) {
- -                      fprintf(stderr, "error: %s: %s\n", argv[0],
- -                              strerror(ENOMEM));
+ +                      fprintf(stderr,
+ +                              "%s %s: cannot allocate memory for parameters: %s\n",
+ +                              progname, argv[0], strerror(ENOMEM));
+ +                      result = -ENOMEM;
                         goto error;
                 }
   
                 param->lsp_stripe_size = lsa.lsa_stripe_size;
                 param->lsp_stripe_offset = lsa.lsa_stripe_off;
                 param->lsp_stripe_count = lsa.lsa_stripe_count;
-               param->lsp_stripe_pattern = 0;
                 param->lsp_pool = lsa.lsa_pool_name;
                 param->lsp_is_specific = false;
                 if (lsa.lsa_nr_osts > 0) {
                         if (lsa.lsa_stripe_count > 0 &&
                             lsa.lsa_nr_osts != lsa.lsa_stripe_count) {
- -                              fprintf(stderr, "error: %s: stripe count '%d' "
- -                                      "doesn't match the number of OSTs: %d\n"
- -                                      , argv[0], lsa.lsa_stripe_count,
+ +                              fprintf(stderr,
+ +                                      "%s %s: stripe count '%d' does not match number of OSTs: %d\n",
+ +                                      progname, argv[0], lsa.lsa_stripe_count,
                                         lsa.lsa_nr_osts);
                                 free(param);
- -                              goto error;
+ +                              goto usage_error;
                         }
   
                         param->lsp_is_specific = true;
@@@ -1688,19 -1778,25 +1760,19 @@@
         }
   
         for (fname = argv[optind]; fname != NULL; fname = argv[++optind]) {
- -              char *op;
                 if (mdt_idx_arg != NULL) {
                         result = llapi_migrate_mdt(fname, &migrate_mdt_param);
                 } else if (migrate_mode) {
                         result = lfs_migrate(fname, migration_flags, param,
                                              layout);
                 } else if (comp_set != 0) {
                         result = lfs_component_set(fname, comp_id,
                                                    lsa.lsa_comp_flags);
                 } else if (comp_del != 0) {
                         result = lfs_component_del(fname, comp_id,
                                                    lsa.lsa_comp_flags);
- -                      op = "delete component of";
                 } else if (comp_add != 0) {
                         result = lfs_component_add(fname, layout);
- -                      op = "add component to";
                 } else if (layout != NULL) {
                         result = lfs_component_create(fname, O_CREAT | O_WRONLY,
                                                       0644, layout);
@@@ -1708,6 -1804,7 +1780,6 @@@
                                 close(result);
                                 result = 0;
                         }
- -                      op = "create composite";
                 } else {
                         result = llapi_file_open_param(fname,
                                                        O_CREAT | O_WRONLY,
@@@ -1716,11 -1813,16 +1788,11 @@@
                                 close(result);
                                 result = 0;
                         }
- -                      op = "create striped";
                 }
                 if (result) {
                         /* Save the first error encountered. */
                         if (result2 == 0)
                                 result2 = result;
- -                      fprintf(stderr, "error: %s: %s file '%s' failed: %s\n",
- -                              argv[0], op, fname,
- -                              lsa.lsa_pool_name != NULL && result == EINVAL ?
- -                              "OST not in pool?" : strerror(errno));
                         continue;
                 }
         }
@@@ -1728,11 -1830,9 +1800,11 @@@
         free(param);
         llapi_layout_free(layout);
         return result2;
+ +usage_error:
+ +      result = CMD_HELP;
   error:
         llapi_layout_free(layout);
- -      return CMD_HELP;
+ +      return result;
   }
   
   static int lfs_poollist(int argc, char **argv)
@@@ -1822,17 -1922,19 +1894,19 @@@ static inline int gid2name(char **name
   
   static int name2layout(__u32 *layout, char *name)
   {
-       char *ptr, *lyt;
+       char *ptr, *layout_name;
   
         *layout = 0;
         for (ptr = name; ; ptr = NULL) {
-               lyt = strtok(ptr, ",");
-               if (lyt == NULL)
+               layout_name = strtok(ptr, ",");
+               if (layout_name == NULL)
                         break;
-               if (strcmp(lyt, "released") == 0)
+               if (strcmp(layout_name, "released") == 0)
                         *layout |= LOV_PATTERN_F_RELEASED;
-               else if (strcmp(lyt, "raid0") == 0)
+               else if (strcmp(layout_name, "raid0") == 0)
                         *layout |= LOV_PATTERN_RAID0;
+               else if (strcmp(layout_name, "mdt") == 0)
+                       *layout |= LOV_PATTERN_MDT;
                 else
                         return -1;
         }
@@@ -4157,8 -4259,7 +4231,8 @@@ all_output
                         break;
                 default:
                         rc = -ENOTSUP;
- -                      break;
+ +                      pass++;
+ +                      goto out;
                 }
                 if (rc)
                         name = "<unknown>";
@@@ -5439,24 -5540,24 +5513,24 @@@ static int lfs_list_commands(int argc, 
   
   int main(int argc, char **argv)
   {
- -        int rc;
+ +      int rc;
   
         /* Ensure that liblustreapi constructor has run */
         if (!liblustreapi_initialized)
                 fprintf(stderr, "liblustreapi was not properly initialized\n");
   
- -        setlinebuf(stdout);
+ +      setlinebuf(stdout);
+ +      opterr = 0;
   
         Parser_init("lfs > ", cmdlist);
   
         progname = argv[0]; /* Used in error messages */
- -        if (argc > 1) {
- -                rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
- -        } else {
- -                rc = Parser_commands();
- -        }
+ +      if (argc > 1)
+ +              rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
+ +      else
+ +              rc = Parser_commands();
   
- -        return rc < 0 ? -rc : rc;
+ +      return rc < 0 ? -rc : rc;
   }
   
   #ifdef _LUSTRE_IDL_H_
author	Mikhal Pershin <mike.pershin@intel.com>
	Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)
committer	Mikhal Pershin <mike.pershin@intel.com>
	Fri, 10 Nov 2017 10:18:48 +0000 (13:18 +0300)
		1	2
lustre/doc/lfs-setstripe.1	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/include/cl_object.h	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/include/lustre_dlm.h	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/include/lustre_osc.h	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/include/obd.h	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/include/uapi/linux/lustre/lustre_idl.h	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ldlm/ldlm_lib.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ldlm/ldlm_lock.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ldlm/ldlm_lockd.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ldlm/ldlm_request.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/llite/llite_lib.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/llite/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/lmv/lmv_obd.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/lod/lod_lov.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/lod/lod_object.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/lod/lod_qos.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/lod/lproc_lod.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/mdc/mdc_locks.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/mdc/mdc_request.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/mdt/mdt_handler.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/mdt/mdt_internal.h	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/mdt/mdt_mds.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/mdt/mdt_open.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/mdt/mdt_reint.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/obdclass/genops.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/obdclass/obd_config.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ofd/lproc_ofd.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ofd/ofd_dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ofd/ofd_dlm.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ofd/ofd_internal.h	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/osc/osc_page.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/osc/osc_request.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/osd-zfs/osd_object.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/ptlrpc/pack_generic.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/target/tgt_grant.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/target/tgt_handler.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/target/tgt_main.c	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/tests/conf-sanity.sh	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/tests/sanity.sh	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/tests/sanityn.sh	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/tests/test-framework.sh	patch \|	diff1 \|	diff2 \|	blob \| history
lustre/utils/lfs.c	patch \|	diff1 \|	diff2 \|	blob \| history